En este notebook se crea un modelo para el AAP dadas unas situaciones atmosféricas.
# Manejo de datos
import pandas as pd
# Análisis de datos
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Modelado
from pycaret.classification import *
import xgboost
# Auxiliares
# Fecha y hora
from datetime import datetime
# Numéricos
import numpy as np
# Texto
import re
# Uso de modelos
import joblib
# Representaciones gráficas
# Graficos
import matplotlib.pyplot as plt
# Mapas
import geopandas
import folium
from folium import plugins
A continuación se cargan en un dataframe los datos extraídos de la fuente de estudio, guardados en formato parquet por su gran volumen de datos.
# Data locations
data_dir = 'Datasets/PHA/'
pha_file = 'PHA_Data.parquet'
legend_file = 'column_info.csv'
model_info_dir = 'Forecast/'
# Full dataset, stored as parquet because of its large volume
data_df = pd.read_parquet(data_dir + pha_file, engine='pyarrow')
# Column legend: dataframe plus a column-code -> description dictionary
df_legend = pd.read_csv(data_dir + legend_file)
leyend_dict = df_legend.set_index('Column_name')['Description'].to_dict()
Dado que el número de columnas es elevado y algunas de estas no aportan información a nuestro caso de estudio, se realiza primero la selección de los atributos de estudio, dejando así un dataframe más reducido en cuanto a sus atributos de partida para poder trabajar el análisis de datos con mayor facilidad.
Como primer estudio para ver la relación de los datos y ayudarnos a tomar la decisión de los atributos a seleccionar, se observa la relación de las columnas de forma gráfica. Esto es, la dependencia que tiene una columna con respecto a las otras si estas varían.
Para este estudio, se emplea el método corr de Pandas. Pero, previamente, se requiere codificar las variables no numéricas, ya que esta función únicamente actúa sobre las columnas numéricas y, en este momento, queremos ver la relación de todas las columnas. Para ello, se hace uso del módulo preprocessing de scikit-learn. Con este módulo se puede hacer uso de métodos que nos permitirán codificar los atributos categóricos a numéricos.
Se utiliza el encoder OrdinalEncoder, que transforma cada valor en su ordinal dentro de la columna, dando como resultado valores numéricos de 0 a n-1 para cada característica.
# Ordinal-encode every column (categories -> 0..n-1) so that corr()
# can be computed over the full attribute set, not only numeric columns
oe = OrdinalEncoder()
# Fit and transform in a single call
data_encoded = oe.fit_transform(data_df)
# Rebuild an auxiliary dataframe with the encoded values and the
# original attribute names
names = oe.get_feature_names_out()
df_encoded = pd.DataFrame(data_encoded, columns=names)
df_encoded
| c1 | c6 | c7 | c8 | c10 | c132 | c134 | c136 | c138 | c141 | ... | c150 | c154 | c156 | c158 | c161 | c163 | c183 | c191 | c229 | c230 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 0.0 | 0.0 | 1937.0 | 26.0 | 17.0 | 26.0 | 14.0 | 9.0 | ... | 2.0 | 2.0 | 11.0 | 13.0 | 33.0 | 33.0 | 1.0 | 1.0 | NaN | NaN |
| 1 | 0.0 | 1.0 | 0.0 | 0.0 | 1937.0 | 26.0 | 17.0 | 26.0 | 14.0 | 9.0 | ... | 2.0 | 2.0 | 11.0 | 12.0 | 33.0 | 33.0 | 1.0 | 1.0 | NaN | NaN |
| 2 | 0.0 | 1.0 | 0.0 | 0.0 | 1937.0 | 26.0 | 17.0 | 26.0 | 14.0 | 9.0 | ... | 2.0 | 2.0 | 11.0 | 12.0 | 33.0 | 33.0 | 1.0 | 1.0 | NaN | NaN |
| 3 | 0.0 | 1.0 | 0.0 | 0.0 | 1937.0 | 26.0 | 17.0 | 26.0 | 14.0 | 9.0 | ... | 2.0 | 2.0 | 11.0 | 12.0 | 33.0 | 33.0 | 1.0 | 1.0 | NaN | NaN |
| 4 | 0.0 | 1.0 | 0.0 | 0.0 | 1937.0 | 26.0 | 17.0 | 26.0 | 14.0 | 9.0 | ... | 2.0 | 2.0 | 11.0 | 13.0 | 33.0 | 33.0 | 1.0 | 1.0 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 214659 | 3.0 | 48.0 | 1.0 | 5.0 | 1100.0 | 26.0 | 17.0 | 26.0 | 14.0 | 9.0 | ... | 3.0 | 2.0 | 11.0 | 12.0 | 33.0 | 33.0 | 1.0 | 1.0 | 5275.0 | NaN |
| 214660 | 3.0 | 48.0 | 1.0 | 6.0 | 967.0 | 26.0 | 17.0 | 26.0 | 14.0 | 9.0 | ... | 2.0 | 2.0 | 11.0 | 12.0 | 33.0 | 33.0 | 1.0 | 1.0 | 3582.0 | NaN |
| 214661 | 3.0 | 48.0 | 0.0 | 30.0 | 519.0 | 26.0 | 17.0 | 26.0 | 14.0 | 9.0 | ... | 3.0 | 2.0 | 11.0 | 12.0 | 33.0 | 33.0 | 1.0 | 1.0 | 6608.0 | 1257.0 |
| 214662 | 3.0 | 48.0 | 0.0 | 6.0 | 251.0 | 26.0 | 17.0 | 26.0 | 14.0 | 9.0 | ... | 3.0 | 2.0 | 11.0 | 12.0 | 33.0 | 33.0 | 1.0 | 1.0 | 2107.0 | NaN |
| 214663 | 0.0 | 47.0 | 10.0 | 17.0 | 714.0 | 26.0 | 17.0 | 26.0 | 14.0 | 9.0 | ... | 3.0 | 2.0 | 11.0 | 11.0 | 33.0 | 33.0 | 1.0 | 1.0 | 2770.0 | NaN |
214664 rows × 149 columns
corr = df_encoded.corr()
# Keep only the strictly upper triangle: the diagonal (always 1) and the
# mirrored lower half are dropped so every attribute pair appears once
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_filter = corr.where(upper_mask)
# Flatten to (pair -> correlation) and sort from strongest to weakest
corr_sort = corr_filter.stack().sort_values(ascending=False).to_frame()
# Replace the column codes with their human-readable descriptions
corr_sort = corr_sort.rename(columns=leyend_dict, index=leyend_dict)
corr_sort.head(60)
| 0 | ||
|---|---|---|
| Including crew and passengers. | Number of passengers onboard | 0.999322 |
| Region of the air operator | District office of the air operator | 0.994042 |
| Region of the accident/incident location. | District office of the accident/incident location. | 0.986830 |
| Code for Weight Class | Powered, Nonpowered, Optional | 0.969616 |
| Under or over 750 hp. | Type of engine. | 0.967706 |
| Type of the engine code. | Piston, turbine, turboprop, etc. | 0.960989 |
| Residence region code of the pilot in command | Residence district office code of the pilot in command | 0.958168 |
| Under or over 750 hp. | Type of the engine code. | 0.951808 |
| Type of the engine code. | Type of engine. | 0.947604 |
| Text for element C153. | 0.940691 | |
| Latitude coordinates of the accident/incident | Longitude coordinates of the accident/incident | 0.939748 |
| Second remedial action area code | Second remedial action area text | 0.938856 |
| Second remedial action taken code | 0.932946 | |
| Number of passengers injured | All injuries including crew passengers and public. | 0.929659 |
| Second persons involved code | Supporting cause factor B code. OPERATIONAL | 0.926073 |
| Number of passengers killed | Total Fatalities | 0.924392 |
| Second remedial action taken code | Second remedial action area text | 0.921989 |
| Persons involved code | Supporting cause factor A code. OPERATIONAL | 0.918476 |
| Piston, turbine, turboprop, etc. | Type of engine. | 0.913665 |
| Under or over 750 hp. | Piston, turbine, turboprop, etc. | 0.913075 |
| Second persons involved code | Second contributing factor text | 0.906583 |
| Code for Aircraft Design | Wing information of the aircraft involved in the investigation. | 0.900926 |
| Piston, turbine, turboprop, etc. | Text for element C153. | 0.900355 |
| Powered, Nonpowered, Optional | Text for element C150 | 0.899042 |
| Supporting factor code TECHNICAL | Supporting factor text TECHNICAL | 0.895825 |
| Wing information of the aircraft involved in the investigation. | Powered, Nonpowered, Optional | 0.887031 |
| Code for Weight Class | Wing information of the aircraft involved in the investigation. | 0.884601 |
| Persons involved code | Persons involved text | 0.884581 |
| Supporting cause factor A code. OPERATIONAL | Persons involved text | 0.879966 |
| Supporting cause factor B code. OPERATIONAL | Second contributing factor text | 0.879617 |
| Powered, Nonpowered, Optional | Wing information of the aircraft text | 0.877517 |
| Under or over 750 hp. | Certificating region of the engine | 0.875991 |
| Persons involved code | Contributing factor text | 0.873574 |
| Type of the engine code. | Certificating region of the engine | 0.867094 |
| Code for Weight Class | Code representing typical engine design characteristics of a specific aircraft model. | 0.863927 |
| Wing information of the aircraft text | 0.862388 | |
| Code for Aircraft Design | Code for Weight Class | 0.858151 |
| Including crew and passengers. | Number of cabin crew members onboard | 0.857341 |
| Under or over 750 hp. | Text for element C153. | 0.856725 |
| First remedial action taken code | First remedial action area text | 0.853943 |
| Certificating region of the engine | Type of engine. | 0.851722 |
| Supporting cause factor B code. OPERATIONAL | Second persons involved text | 0.850641 |
| Supporting cause factor A code. OPERATIONAL | Contributing factor text | 0.849209 |
| Number of passengers onboard | Number of cabin crew members onboard | 0.845135 |
| Any other rating code of the pilot in command . | Date of birth of second Pilot | 0.844683 |
| Contributing factor text | Persons involved text | 0.843510 |
| Second remedial action taken code | Second remedial action taken text | 0.842445 |
| Number of ground crew members killed | Number of public/third party persons killed | 0.842082 |
| Certificating region of the engine | Piston, turbine, turboprop, etc. | 0.837519 |
| Second contributing factor text | Second persons involved text | 0.830486 |
| Second persons involved code | Second persons involved text | 0.827027 |
| Second contributing case code f | Second contributing factor text | 0.825891 |
| Second supporting factor code . TECHNICAL | Second supporting factor text TECHNICAL | 0.819069 |
| Text for element C153. | Type of engine. | 0.818072 |
| Certificating region of the engine | Text for element C153. | 0.817312 |
| Code for Aircraft Design | Code representing typical engine design characteristics of a specific aircraft model. | 0.817296 |
| 2nd Additional cause factor code | 2nd Additional cause factor text | 0.795763 |
| Code for Weight Class | Text for element C150 | 0.794342 |
| General cause category code | Light condition text | 0.792735 |
| Wing information of the aircraft involved in the investigation. | Wing information of the aircraft text | 0.791066 |
Las primeras 60 correlaciones que se obtienen hacen referencia a atributos que no son relevantes para el caso de estudio de este modelo. Como se ve, en gran parte hacen referencia a personas muertas o heridas en los sucesos, dando una gran correlación entre las personas tripulantes y pasajeros, que por razones obvias están muy correlacionadas.
Para seguir con el estudio de la correlación, se muestra otro fragmento de resultados.
# Next 60 results (rows 60-119 of the sorted correlation pairs)
corr_sort.iloc[60:120]
| 0 | ||
|---|---|---|
| Primary type of flying code | Primary flying condition code | 0.785883 |
| Code for Aircraft Design | Powered, Nonpowered, Optional | 0.785133 |
| Wing information of the aircraft involved in the investigation. | Text for element C150 | 0.774048 |
| Total number of hours in the last 90 days the pilot has flown this make and model | Total number of hours in the last 90 days the pilot has flown. | 0.770009 |
| Second supporting factor code . TECHNICAL | Second cause factor text | 0.769300 |
| Primary type of flying code | Light condition text | 0.767437 |
| Wing information of the aircraft text | Text for element C150 | 0.762513 |
| Primary type of flying code | Secondary type of flying code | 0.761877 |
| Primary flying condition code | General cause category code | 0.761451 |
| Second remedial action area text | Second remedial action taken text | 0.759669 |
| Primary flying condition code | Light condition text | 0.758806 |
| Certificate number of the air agency | District office of the air operator | 0.758416 |
| Primary type of flying code | General cause category code | 0.752481 |
| Year the accident/incident happened. | Qualification code of the pilot in command . | 0.748940 |
| General cause category code | General cause category text | 0.742242 |
| Powered, Nonpowered, Optional | Code representing typical engine design characteristics of a specific aircraft model. | 0.737555 |
| Make of the engine | Model of the engine | 0.734207 |
| Second remedial action area code | Second remedial action taken text | 0.734046 |
| Certificate number of the air agency | Region of the air operator | 0.729220 |
| Powered, Nonpowered, Optional | Aid Text | 0.725186 |
| Second cause factor text | Second supporting factor text TECHNICAL | 0.722129 |
| Second contributing case code f | Second persons involved code | 0.721485 |
| Wind direction | Wind speed in miles per hours | 0.713187 |
| Wing information of the aircraft involved in the investigation. | Code representing typical engine design characteristics of a specific aircraft model. | 0.712037 |
| Secondary cause factor code | Second cause factor text | 0.709523 |
| Group of the aircraft | Region of certification for the aircraft | 0.708361 |
| Code representing typical engine design characteristics of a specific aircraft model. | Wing information of the aircraft text | 0.707777 |
| Code for Aircraft Design | Wing information of the aircraft text | 0.707625 |
| Wing information of the aircraft involved in the investigation. | Low wing, high wing, rotary wing, etc. | 0.707518 |
| Primary type of flying code | Type of accident code | 0.699291 |
| Contributing factor code | Contributing factor text | 0.695022 |
| Under or over 750 hp. | Make of the engine | 0.694418 |
| Primary type of flying code | Rating description of the pilot in command | 0.691014 |
| Primary type of flying text | Secondary type of flying text | 0.687593 |
| State of the owner / operator | Certificate number of the air agency | 0.685613 |
| Second contributing case code f | Supporting cause factor B code. OPERATIONAL | 0.684939 |
| Code for Aircraft Design | Number of engines on aircraft | 0.684833 |
| Designator code of the air operator (OPCD) | Certificate number of the air agency | 0.682313 |
| Code for Weight Class | Number of engines on aircraft | 0.673069 |
| Code for Aircraft Design | Text for element C150 | 0.671971 |
| Type of Landing Gear code. | Type of landing gear. | 0.667748 |
| Additional cause factor code | Additional cause factor text | 0.667574 |
| General cause category text | Light condition text | 0.663380 |
| State of the owner / operator | District office of the air operator | 0.663298 |
| Gust indicator flag | Gust speed in miles per hour | 0.663103 |
| Type of the engine code. | Make of the engine | 0.661287 |
| Year the accident/incident happened. | Light condition text | 0.659318 |
| Make of the engine | Type of engine. | 0.657873 |
| Light condition code | Light condition text | 0.657086 |
| Supporting cause factor B code. OPERATIONAL | Supporting cause factor B text. OPERATIONAL | 0.654122 |
| Qualification code of the pilot in command . | Qualification description of the pilot in command | 0.652020 |
| Primary type of flying code | Qualification code of the pilot in command . | 0.650067 |
| Profession code of the pilot in command | Profession description of the pilot in command | 0.649749 |
| Second contributing case code f | Second persons involved text | 0.648105 |
| Primary cause factor code | General cause category text | 0.647377 |
| Primary flying condition code | Qualification code of the pilot in command . | 0.646046 |
| Aid Text | Wing information of the aircraft text | 0.645631 |
| Year the accident/incident happened. | Primary flying condition code | 0.645156 |
| Name of the owner / operator of aircraft | Designator code of the air operator (OPCD) | 0.644877 |
| Year the accident/incident happened. | General cause category code | 0.642552 |
Las siguientes correlaciones ya aportan un mayor número de relaciones interesantes para el estudio del modelo. Se puede extraer que las Primary flying condition tienen una importante correlación con el resultado del suceso.
Además, se observa que atributos climáticos como velocidad y direccion del viento toman relevancia en la relación de atributos.
También se observa que el tipo de vuelo y la calificación del piloto tienen cierta relación. Esto se considera interesante ya que se entiende de la calificación/experiencia de un piloto puede ser determinante en determinadas situaciones climáticas.
Para seguir con el estudio de la correlación, se muestra otro fragmento de resultados.
# Next 60 results (rows 120-179 of the sorted correlation pairs)
corr_sort.iloc[120:180]
| 0 | ||
|---|---|---|
| State of the owner / operator | Region of the air operator | 0.642507 |
| Rating description of the pilot in command | Any other rating description of the pilot in command | 0.642503 |
| Secondary flying condition code | Secondary flying condition text | 0.642084 |
| General cause category code | Primary flying condition text | 0.641682 |
| Primary flying condition code | Rating description of the pilot in command | 0.639773 |
| Year the accident/incident happened. | Primary type of flying code | 0.639728 |
| Code for Aircraft Design | Low wing, high wing, rotary wing, etc. | 0.638445 |
| Make of the engine | Piston, turbine, turboprop, etc. | 0.636906 |
| Qualification code of the pilot in command . | Light condition text | 0.631259 |
| Primary flying condition code | Type of accident code | 0.630292 |
| Supporting cause factor A code. OPERATIONAL | Supporting Factor Text OPERATIONAL | 0.627523 |
| Code for Weight Class | Aid Text | 0.621231 |
| Group of the aircraft | Low wing, high wing, rotary wing, etc. | 0.618484 |
| General cause category code | Any other rating description of the pilot in command | 0.618110 |
| Sky condition code | Wind direction | 0.615710 |
| Contributing factor code | Persons involved code | 0.611084 |
| Primary flying condition code | Secondary flying condition code | 0.610887 |
| Qualification code of the pilot in command . | General cause category code | 0.610070 |
| Make of the engine | Text for element C153. | 0.609853 |
| Sky condition code | Wind speed in miles per hours | 0.608882 |
| Rating description of the pilot in command | Light condition text | 0.606178 |
| Primary flying condition code | Any other rating description of the pilot in command | 0.606139 |
| Profession code of the pilot in command | Any other rating description of the pilot in command | 0.605513 |
| Model of the aircraft | Group of the aircraft | 0.605425 |
| General cause category code | Rating description of the pilot in command | 0.603962 |
| Qualification code of the pilot in command . | Profession code of the pilot in command | 0.601655 |
| Airworthiness class code of the aircraft | This field contains the year of manufacture of the aircraft | 0.601494 |
| Primary flying condition code | General cause category text | 0.601327 |
| First remedial action area code | First remedial action taken code | 0.601279 |
| Qualification code of the pilot in command . | Rating description of the pilot in command | 0.600971 |
| Primary flying condition code | Primary flying condition text | 0.599709 |
| Type of accident code | Light condition text | 0.594636 |
| Low wing, high wing, rotary wing, etc. | Primary type of flying code | 0.594010 |
| Contributing factor code | Supporting cause factor A code. OPERATIONAL | 0.592817 |
| Wing information of the aircraft involved in the investigation. | Aid Text | 0.592628 |
| Year the accident/incident happened. | Number of flight crew members onboard | 0.591576 |
| Profession code of the pilot in command | 0.590258 | |
| Any other rating description of the pilot in command | Light condition text | 0.589424 |
| Year the accident/incident happened. | Low wing, high wing, rotary wing, etc. | 0.588017 |
| Secondary type of flying code | Rating description of the pilot in command | 0.587555 |
| Group of the engine | Text for element C153. | 0.583609 |
| Phase of flight code | Phase of flight text | 0.582067 |
| General cause category code | Emergency Locator Transmitter on Board? Y or N | 0.580920 |
| Type of the engine code. | Group of the engine | 0.579681 |
| Aid Text | Text for element C150 | 0.578572 |
| Contributing factor code | Persons involved text | 0.575559 |
| Primary cause factor code | General cause category code | 0.574529 |
| Number of ground crew members injured | Number of public/third party persons injured | 0.572808 |
| Under or over 750 hp. | Group of the engine | 0.569256 |
| General cause category text | Primary flying condition text | 0.568902 |
| Code for Weight Class | Low wing, high wing, rotary wing, etc. | 0.568510 |
| Powered, Nonpowered, Optional | Low wing, high wing, rotary wing, etc. | 0.568341 |
| Primary flying condition text | Light condition text | 0.568058 |
| Secondary type of flying code | Primary flying condition code | 0.567609 |
| Qualification code of the pilot in command . | Profession description of the pilot in command | 0.566565 |
| Primary type of flying code | General cause category text | 0.566038 |
| Emergency Locator Transmitter on Board? Y or N | Light condition text | 0.565441 |
| Year the accident/incident happened. | Profession description of the pilot in command | 0.564238 |
| Model of the engine | Group of the engine | 0.563629 |
| Type of accident code | General cause category code | 0.562016 |
Finalmente, se observa como nuevamente toma peso las condiciones climaticas así como las condiciones del vuelo y la calificación del piloto.
Teniendo en cuenta las correlaciones y el modelo de datos necesario para la predicción, las columnas seleccionadas han sido decididas en base a:
def col_name(col: str):
    """
    Return the human-readable description of column code ``col``,
    looked up in the ``df_legend`` dataframe.
    """
    mask = df_legend["Column_name"] == col
    return df_legend.loc[mask, "Description"].iloc[0]
def clean_data(value):
    """
    Normalize a raw cell value taken from the dataframe.

    Strings are stripped of leading/trailing whitespace; blank or
    placeholder tokens ('', 'nan', 'N/A') become np.nan. Non-string
    missing markers (None, np.nan, pd.NA, NaT) are unified to np.nan.
    Any other value is returned unchanged.
    """
    if isinstance(value, str):
        value = value.strip()
        # A blank string after stripping is also a missing value
        if value in ('', 'nan', 'N/A'):
            return np.nan
        return value
    # pd.isna handles None, np.nan, pd.NA and NaT uniformly; the old
    # membership test (`value in [..., pd.NA]`) raised TypeError on
    # pd.NA (ambiguous truth value), which the broad except silently
    # swallowed, letting pd.NA leak through unconverted.
    try:
        if pd.isna(value):
            return np.nan
    except (TypeError, ValueError):
        # list-like values cannot be tested with pd.isna as a scalar
        pass
    return value
# Selected column numbers (chosen from the correlation study)
selected_ids = [1, 6, 7, 10, 11, 12, 13, 14, 20, 21, 30, 31, 32, 41, 49, 56,
                96, 106, 108, 110, 112, 113, 114, 115, 240, 241, 242, 243]
columns = [f'c{col}' for col in selected_ids]
for col in columns:
    print(f'[+] SELECTED_COL\t{col}\t{col_name(col)}')
# Drop every column that is not in the selection (keeps data_df order)
df = data_df.drop(columns=set(data_df.columns) - set(columns))
# Clean the values of each remaining column, printing a countdown
columns = len(df.columns)
print('\n')
for column in df.columns:
    # Apply the value cleaning element-wise via a comprehension
    df[column] = [clean_data(value) for value in df[column]]
    columns -= 1
    print(f'[+] REMAINING_CLEANING\t{columns}\t', end='\r')
print(' '*50)
df
[+] SELECTED_COL c1 Type of Event
[+] SELECTED_COL c6 Year the accident/incident happened.
[+] SELECTED_COL c7 Month the accident/incident happened.
[+] SELECTED_COL c10 Local time of the accident/incident.
[+] SELECTED_COL c11 Region of the accident/incident location.
[+] SELECTED_COL c12 District office of the accident/incident location.
[+] SELECTED_COL c13 State of the accident/incident location.
[+] SELECTED_COL c14 City of the accident/incident location.
[+] SELECTED_COL c20 Latitude coordinates of the accident/incident
[+] SELECTED_COL c21 Longitude coordinates of the accident/incident
[+] SELECTED_COL c30 Airworthiness class code of the aircraft
[+] SELECTED_COL c31 Airframe hours of the aircraft
[+] SELECTED_COL c32 This field contains the year of manufacture of the aircraft
[+] SELECTED_COL c41 Certificate type code of the pilot in command
[+] SELECTED_COL c49 Qualification code of the pilot in command .
[+] SELECTED_COL c56 Total number of hours the pilot has flown.
[+] SELECTED_COL c96 Phase of flight code
[+] SELECTED_COL c106 Primary flying condition code
[+] SELECTED_COL c108 Secondary flying condition code
[+] SELECTED_COL c110 Light condition code
[+] SELECTED_COL c112 Sky condition code
[+] SELECTED_COL c113 Cloud ceiling
[+] SELECTED_COL c114 Visibility code
[+] SELECTED_COL c115 Visibility restriction code.
[+] SELECTED_COL c240 Wind direction
[+] SELECTED_COL c241 Wind speed in miles per hours
[+] SELECTED_COL c242 Gust indicator flag
[+] SELECTED_COL c243 Gust speed in miles per hour
| c1 | c6 | c7 | c10 | c30 | c31 | c32 | c11 | c12 | c13 | ... | c114 | c115 | c240 | c241 | c242 | c243 | c41 | c49 | c56 | c96 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A | 1975 | 1 | NaN | STRD | NaN | * | CE | CE11 | MO | ... | NaN | NaN | NaN | NaN | NaN | NaN | 03 | E | 1187.0 | IB |
| 1 | A | 1975 | 1 | NaN | STRD | NaN | * | GL | GL19 | IL | ... | NaN | NaN | NaN | NaN | NaN | NaN | 03 | G | 364.0 | DD |
| 2 | A | 1975 | 1 | NaN | NaN | NaN | * | SO | SO62 | TN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 01 | G | 205.0 | DD |
| 3 | A | 1975 | 1 | NaN | NaN | NaN | * | GL | GL18 | IN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 01 | G | 95.0 | HB |
| 4 | A | 1975 | 1 | NaN | STRD | NaN | * | NW | NW02 | OR | ... | NaN | NaN | NaN | NaN | NaN | NaN | 03 | G | 233.0 | IA |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 214659 | I | 2022 | 2 | 1710 | NaN | 929 | * | CE | CE19 | TN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 01 | NaN | 90.0 | TC |
| 214660 | I | 2022 | 2 | 1505 | STRD | 1823 | * | GL | GL05 | OH | ... | NaN | NaN | NaN | NaN | NaN | NaN | 01 | NaN | 413.0 | LR |
| 214661 | I | 2022 | 1 | 0832 | STRD | 970 | * | NM | NM07 | UT | ... | NaN | NaN | NaN | NaN | NaN | NaN | 03 | NaN | 1551.0 | TD |
| 214662 | I | 2022 | 1 | 0407 | STRD | NaN | * | WP | WP17 | NY | ... | NaN | NaN | NaN | NaN | NaN | NaN | 09 | NaN | 14190.0 | LT |
| 214663 | A | 2021 | 11 | 1130 | STRD | 1532 | * | EA | EA13 | PA | ... | NaN | NaN | NaN | NaN | NaN | NaN | 01 | NaN | 609.0 | LA |
214664 rows × 28 columns
Para tener un contexto de los datos que se tiene, se realiza una serie de visualización de datos.
La primera de estas acciones consiste en describir los datos que tenemos, en su tipo y cuántos de estos datos son válidos, o al menos, no desconocidos (NaN)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 214664 entries, 0 to 214663 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 c1 214635 non-null object 1 c6 214664 non-null int64 2 c7 214632 non-null object 3 c10 193377 non-null object 4 c30 40022 non-null object 5 c31 120371 non-null object 6 c32 46392 non-null object 7 c11 214157 non-null object 8 c12 214073 non-null object 9 c13 213887 non-null object 10 c14 212818 non-null object 11 c20 59517 non-null object 12 c21 59471 non-null object 13 c106 171955 non-null object 14 c108 171696 non-null object 15 c110 168469 non-null float64 16 c112 90250 non-null object 17 c113 26396 non-null object 18 c114 88222 non-null object 19 c115 18177 non-null object 20 c240 73644 non-null object 21 c241 72778 non-null object 22 c242 10638 non-null object 23 c243 10532 non-null object 24 c41 200888 non-null object 25 c49 156141 non-null object 26 c56 177616 non-null float64 27 c96 207603 non-null object dtypes: float64(2), int64(1), object(25) memory usage: 45.9+ MB
De forma gráfica, y desde el concepto de porcentaje de valores NaN en las columnas, se muestra el siguiente gráfico.
def plot_nan(df):
    """
    Plot, as a bar chart, the percentage of NaN values in each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose columns are inspected for missing values.
    """
    # Vectorized replacement for the old per-column Python loop:
    # fraction of missing values per column -> percentage, rounded to
    # two decimals. Also avoids a ZeroDivisionError on an empty frame.
    info = (df.isna().mean() * 100).round(2).to_dict()
    # Render the bar chart
    plt.figure(figsize=(10, 5))
    plt.bar(info.keys(), info.values())
    plt.xlabel('Atributos')
    plt.xticks(rotation=45)
    plt.ylabel('% NaN')
    plt.title('Valores NaN')
    plt.show()


plot_nan(df)
El modelo que se desea realizar, debe de ser capaz de predecir el tipo de incidente que ocurrirá. Para ver si tenemos unos datos balanceados, a continuación se muestra una gráfica de la predicción a realizar, en donde podemos observa que si existe un balance de datos.
# Frequency of each event type, to inspect the class balance of the
# target attribute
event_counts = df.c1.value_counts()
ax = event_counts.plot(kind='bar', title='Tipo de evento')
ax.set_ylabel('Frecuencia')
# Slightly rotate the x tick labels for readability
plt.setp(ax.xaxis.get_ticklabels(), rotation=15)
plt.show()
A continuación se muestra el balanceo de datos para la columna del mes del incidente. Teniendo en cuenta el tipo de análisis a realizar, se realiza una previa visualización de la distribución de los sucesos por esta característica.
# Event counts per month, with the month index in ascending order.
# (The previous reindex over an already-sorted index was a no-op and
# has been removed.)
serie_counts = df.c7.value_counts().sort_index()
# Bar chart of the distribution of events per month
ax = serie_counts.plot(kind='bar', title='Mes del incidente')
ax.set_ylabel('Frecuencia')
# Rotate the tick labels for readability
for label in ax.xaxis.get_ticklabels():
    label.set_rotation(15)
plt.show()
Se considera de relevancia conocer la ubicación del avion en el momento del suceso, es por esto que se añade los atributos de latitud y longitud al conjunto de características a trabajar en el dataset. No obstante, estos atributos son presentados de forma categórica, por lo que para poder emplearlos, se convierte a numérico.
Esto se debe a que las coordenadas están presentadas en grados y minutos en su origen; por lo tanto, estos valores se convierten a coordenadas decimales como se presenta a continuación.
def convert_coords(coord):
    """
    Convert a coordinate given in degrees and minutes to a decimal coordinate.

    Parameters
    ----------
    coord : str
        Coordinate string ending in 'N', 'S', 'E' or 'W'. Degrees and minutes
        may be concatenated ('4530N' -> DDMM, '12030W' -> DDDMM) or separated
        by whitespace ('45 30N').

    Returns
    -------
    float
        Signed decimal coordinate (negative for 'S'/'W'), or the sentinel 999
        when the input cannot be parsed or is out of range.
    """
    err = 999
    dir_tuple = ('N', 'S', 'E', 'W')
    # Coordinates come as strings in the dataframe; anything else is
    # unparseable, so return the error sentinel.
    if not isinstance(coord, str):
        return err
    coord = coord.strip()
    # Extract the numeric tokens
    match = re.findall(r'[\d.]+', coord)
    # A valid coordinate ends with a cardinal direction and contains digits
    if not coord.endswith(dir_tuple) or not match:
        return err
    direction = coord[-1:]
    # Degrees and minutes concatenated without a separator:
    # 5 digits -> DDDMM, 4 digits -> DDMM
    if len(match) == 1:
        token = match[0]
        if len(token) == 5:
            match = [token[:3], token[3:5]]
        elif len(token) == 4:
            match = [token[:2], token[2:4]]
        else:
            # BUGFIX: the original code fell through here with a single string
            # token and later crashed comparing str with int; treat any other
            # token length as unparseable.
            return err
    # Anything other than exactly [degrees, minutes] at this point is invalid
    if len(match) != 2:
        return err
    try:
        grade, minute = float(match[0]), float(match[1])
    except ValueError:
        # Tokens such as '.' match the regex but are not valid numbers
        return err
    # Degrees range check per axis
    if direction in ('N', 'S') and (grade < 0 or 90 < grade):
        return err
    if direction in ('E', 'W') and (grade < 0 or 180 < grade):
        return err
    # Minutes range check
    if minute < 0 or 59 < minute:
        return err
    # Convert to decimal; south/west hemispheres are negative
    decimal_coord = grade + minute / 60
    return decimal_coord if direction in ('N', 'E') else decimal_coord * -1
# Convert the latitude (c20) and longitude (c21) columns from
# degrees-and-minutes strings to decimal coordinates
for coord_col in ('c20', 'c21'):
    df[coord_col] = df[coord_col].apply(convert_coords)
# Build geometric points for the map from latitude and longitude.
# NOTE(review): geopandas.points_from_xy(x, y) expects x=longitude and
# y=latitude, but here c20 (latitude) is passed as x and c21 (longitude)
# as y. The swap cancels out below, where [point.x, point.y] is read back
# as [lat, lon] — exactly the ordering folium's HeatMap expects. Confirm
# before reusing this geometry for anything other than the heatmap.
geometry = geopandas.points_from_xy(df.c20, df.c21)
geo_df = geopandas.GeoDataFrame(df[["c20", "c21"]], geometry=geometry)
# Create the folium map instance, centred on (0, 0) and zoomed out
geo_map = folium.Map(location=[0, 0], tiles="Cartodb dark_matter", zoom_start=2)
# Collect coordinate pairs, skipping the 999 sentinel that convert_coords
# produced for unparseable values
heat_data = [[point.xy[0][0], point.xy[1][0]] for point in geo_df.geometry if point.xy[1][0] != 999 and point.xy[0][0] != 999]
# Add the points to the map as a heatmap layer
plugins.HeatMap(heat_data).add_to(geo_map)
geo_map
Dado el elevado número de datos faltantes y sabiendo que existen valores que están en columnas diferentes por el estudio realizado para obtener el dataset de trabajo, a continuación se detallan los valores que existen para cada uno de los atributos del modelo de datos obtenido.
# Walk every column and dump its description and unique values
for column in df:
    print(f"[+] {column}\t{col_name(column)}")
    print(f"VALUES\t{df[column].unique()}\n")
[+] c1 Type of Event VALUES ['A' 'I' nan 'AERO SAFETY FLIGHT SCHOOL' 'UNIVERSAL FLIGHT TRAINING' 'DULLES AVIATION'] [+] c6 Year the accident/incident happened. VALUES [1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 0 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022] [+] c7 Month the accident/incident happened. VALUES [1 2 3 4 5 6 7 10 9 8 11 12 <NA>] [+] c10 Local time of the accident/incident. VALUES [nan '0000' '1515' ... '4105' '0129' '0239'] [+] c30 Airworthiness class code of the aircraft VALUES ['STRD' nan 'REST' 'EXPT' 'MULT' 'LIMT' 'PROV'] [+] c31 Airframe hours of the aircraft VALUES [nan '660.0' '28.0' ... '9887' '10009' '10905'] [+] c32 This field contains the year of manufacture of the aircraft VALUES ['*' nan] [+] c11 Region of the accident/incident location. VALUES ['CE' 'GL' 'SO' 'NW' 'SW' 'WE' 'RM' 'PC' 'NE' 'EA' 'AL' 'EU' 'WP' 'NM' nan 'AS' 'WA' '99' 'SD' 'FS'] [+] c12 District office of the accident/incident location. 
VALUES ['CE11' 'GL19' 'SO62' 'GL18' 'NW02' 'SW09' 'NW61' 'GL07' 'SO05' 'GL05' 'GL13' 'GL20' 'WE62' 'NW03' 'WE11' 'GL03' 'SO09' 'SW07' 'CE04' 'RM03' 'GL14' 'PC51' 'NE13' 'SW06' 'CE22' 'RM67' 'EA16' 'SO07' 'SW08' 'NE15' 'SO11' 'RM02' 'GL06' 'SO01' 'CE12' 'EA21' 'AL01' 'SO08' 'CE62' 'SW11' 'WE03' 'GL08' 'SO61' 'WE12' 'WE02' 'SO13' 'WE66' 'WE06' 'NW05' 'NE19' 'WE08' 'SW10' 'WE01' 'WE64' 'EA22' 'SW03' 'AL62' 'RM06' 'GL10' 'SO02' 'SO03' 'SW02' 'EA14' 'AL61' 'SW04' 'RM05' 'SO63' 'SW01' 'EA03' 'NW01' 'EA11' 'EA10' 'RM04' 'EA09' 'SW12' 'EA62' 'WE04' 'EA18' 'SW05' 'EA17' 'UNKN' 'PC61' 'RM01' 'EA01' 'WE65' 'SO04' 'WP04' 'NM61' 'WP01' 'WP64' 'NM02' 'NM01' 'WP09' 'WP08' 'NM66' 'NM03' 'NM67' 'WP03' 'GL61' 'WP11' 'WP66' 'WP02' 'WP61' 'WP12' 'WP65' 'WP06' 'WP67' 'CE64' 'EA61' 'SW61' 'NM64' 'NM65' 'SO64' 'CE65' 'SW62' 'NM62' 'GL64' 'NM63' 'CE61' 'SW64' 'SW65' nan 'GL66' 'GL63' 'WP33' 'NM07' 'SO65' 'WP05' 'SO66' 'GL65' 'WP14' 'SO67' 'SW63' 'SW68' 'EU01' 'NE61' 'WP13' 'SW67' 'NE63' 'WP07' 'SW66' 'NE65' 'GL31' 'EA00' 'WP10' 'EU00' 'EA63' 'SW00' 'NM05' 'AL31' 'WP31' 'SO00' 'NE03' 'SO19' 'CE07' 'GL27' 'SO15' 'WP27' 'SW17' 'WP25' 'WP17' 'SO53' 'EA13' 'GL09' 'NE05' 'NE01' 'EA38' 'WP15' 'GL15' 'GL11' 'GL23' 'SW33' 'GL34' 'AL63' 'ASF1' 'SW41' 'CE33' 'AL79' 'EA31' 'NM60' 'CE63' 'EA07' 'CE05' 'EA25' 'AL03' 'EA27' 'AL05' 'GL25' 'EA23' 'EA05' 'CE03' 'WP23' 'SW15' 'SW13' 'CE09' 'GL17' 'WP21' 'NM09' 'CE01' 'SO21' 'WP19' 'NM13' 'GL21' 'SO16' 'EA15' 'SO06' 'EA19' 'SO17' 'NM04' 'WP00' 'NM00' 'EA29' 'NM08' 'SO14' 'WP28' 'NE02' 'GL01' 'CE45' 'EU51' 'SW19' 'CE71' 'NM06' 'NM10' 'SO80' 'WA45' 'NM11' 'GL00' 'SW21' 'EU03' 'WP29' 'SO25' 'EA35' 'SO33' 'SO27' 'SO23' 'AL00' 'EU70' 'SO35' 'EA33' 'SW23' 'CE02' 'SW27' 'CE13' 'SW29' 'EA68' 'SO29' 'EA65' 'SO37' 'SW25' 'CE17' 'SW31' 'EA39' 'CE19' 'CE21' 'NM21' 'NM19' 'WP30' 'SO39' 'WP35' 'CE25' 'AL07' 'WP37' 'GL33' 'CE23' 'AL09' 'WP39' 'SW39' 'FS58' 'FS57' 'FS59' 'GL62' 'FS56'] [+] c13 State of the accident/incident location. 
VALUES ['MO' 'IL' 'TN' 'IN' 'OR' 'OK' 'WA' 'OH' 'FL' 'WI' 'MI' 'AZ' 'CA' 'SC' 'TX' 'IA' 'CO' 'MN' 'XX' 'MA' 'AR' 'KS' 'UT' 'VA' 'LA' 'NH' 'NC' 'WY' 'GA' 'NE' 'MD' 'AK' 'KY' 'ID' 'CT' 'NM' 'NV' 'SD' 'AL' 'PA' 'MT' 'NY' 'VT' 'ND' 'DE' 'DC' 'NJ' 'RQ' 'HI' 'ME' 'MS' 'RI' 'WV' 'VQ' 'CN' nan 'GQ' 'BF' 'IR' 'JA' 'DR' 'RP' 'PN' 'CM' 'AQ' 'UK' 'MX' 'TT' 'GE' 'CH' 'EI' 'HA' 'VE' 'IT' 'MY' 'AV' 'GC' 'TC' 'CU' 'JM' 'BL' 'PO' 'NU' 'VM' 'TQ' 'BE' 'NL' 'SW' 'KO' 'BR' 'NR' 'IC' 'FR' 'SA' 'CS' 'TK' 'DO' 'GT' 'YE' 'PR' 'WN' 'PQ' 'II' 'GL' 'MB' 'KE' 'AO' 'SF' 'VI' 'BH' 'YO' 'BD' 'MQ' 'PE' 'BA' 'CQ' 'HK' 'NZ' 'VC' 'CP' 'AC' 'GY' 'HO' 'SK' 'FT' 'PG' 'AS' 'NT' 'TO' 'GR' 'KU' 'PS' 'KQ' 'IE' 'AY' 'TD' 'FM' 'WQ' 'SP' 'PC' 'NG' 'BC' 'AJ' 'EC' 'AG' 'SG' 'BP' 'MP' 'PL' 'EU' 'NX' 'ES' 'SN' 'SX' 'TH' 'LI' 'FJ' 'ST' 'AA' 'IS' 'DA' 'US' 'QA' 'RM' 'TG' 'ZZ' 'OC' 'ON' 'GU' 'AB' 'NS'] [+] c14 City of the accident/incident location. VALUES ['RICHMOND' 'OLNEY' 'MONTEREY' ... 'DREXEL HILL' 'RAINBOW' 'ADAIR'] [+] c20 Latitude coordinates of the accident/incident VALUES [999. 42.16666667 51.33333333 ... 67.45 24.88333333 20.23333333] [+] c21 Longitude coordinates of the accident/incident VALUES [ 999. -120.4 -37.5 ... -12.4 97.71666667 -140.23333333] [+] c106 Primary flying condition code VALUES ['B' 'A' 'C' nan '19540228.0' '19620714.0' '19760912.0' '19700202.0' '19680726.0' '19691204.0' '19641127.0' '19440916.0' '19560306.0' '19550927.0' '19961002.0' '19670626.0' '19930126.0' '19571026.0' '19691006.0'] [+] c108 Secondary flying condition code VALUES ['S' 'D' 'U' 'M' 'G' 'N' nan 'L' 'T' 'H' 'F' 'I' 'O' 'W' 'J' 'V' 'A' 'P' 'B' 'C' 'R' 'K' 'E' 'Q' '19890602.0'] [+] c110 Light condition code VALUES [ 4. 1. 3. 2. 5. 
nan] [+] c112 Sky condition code VALUES [nan 'BRKN' 'SCAT' 'CLER' 'OBSC' 'OCST'] [+] c113 Cloud ceiling VALUES [nan '600' '200' '400' 'CLER' '800' '7000' '4500' '3500' '1000' '2500' '500' '2000' '5000' '3000' 'GRIK' '2300' '1800' '6000' '9999' '2900' '3100' '900' '4000' '2200' '9000' '8000' '1721' '1200' '5500' '100' '700' '6400' '1500' '1900' '4900' '2400' '3400' '1300' '1700' '7500' '3200' '2100' '$)))' '3300' 'GRIL' '2700' 'GEIK' '4600' 'G10' '1600' '6500' 'SCAT' '2600' '1400' '300' '2800' '3800' '4400' '6' '9500' '2020' '1100' '4300' '3700' '8500' '7' '0700' 'BRKN' 'ZERO' '3600' '0' 'GR1K' '2' '1340' 'OCS' '850' '150' '0600' 'OVCT' '3900' '?' '.' '2280' '5600' '38' '10,0' '1' '4100' '5' '7200' '250' '8' '1032' '25' '4700' '75' '85' '50' '125' '4002' 'GRI' 'OCST' 'F' '40' '4200' '650' '0500' '10' '4800' '1350' 'GIK' 'GROL' 'GERI' '0400' '0100' '00' 'OBSC' '800L' '7800' '5005' '330G' '8400' '5002' 'GRKG' '0300' '0200' '9200' '0800' '900G' '8001' '200L' '1040' '0900' '9-99' '9997' '0000' '0850' '5800' '0004' '0740' '0350' '6600' '0003' '7003' '0010' '7600' '0050' '0660' '4003' '5200' '2003' '0450' '9007' '0006' '9003' '3001' '4001' '0150' '0060' '4566' '7002' '0250' '0045' '0009' '8200' '0005' '0020' '4350' '0075' '8003' '0750' '0007' '0001' '2504' '0804' '0999' '0040' '3380' '7400' '0002' '9025' '6100' '5400' '20' '350' '3404' '4509' '6002' '450' '3' '70' '5100' '0008' '0070' '0015' '5300' '0011' '0013' '0017' '0023' '0032' '0035' '0055' '0016' '0030' '0080' '0022' '0025' '0046' '0012' '0036' '0028' '0029' '0085' '0095' '0031' '0110' '0024' '0140' '0037' '0090' '0033' '0018' '0220' '0034' '0026' '0038' '0044' '5001' '0041' '0049' '0014' '0047' '0021' '0043' '0042' '2599' '0019' '0048' '0065' '0160' '0027' '3204' '0039' '0120' '5.0' '11.0' '15.0' '40.0' '10.0' '25.0' '50.0' '1.0' '30.0' '16.0' '6.0' '4.0' '9.0' '3.0' '41.0' '21.0' '47.0' '8.0' '20.0' '12.0' '60.0' '2.0' '35.0' '19.0' '7.0' '80.0' '18.0' '28.0' '27.0'] [+] c114 Visibility code VALUES [nan '5' 'G10' 
'10' 'LS1' '1' '3' '4' '6' '7' '2' '8' '9' 'GR3'] [+] c115 Visibility restriction code. VALUES [nan 'FOG' 'RAI' 'HAZ' 'SNO' 'DRZ' 'BLS' 'DST' 'SMK'] [+] c240 Wind direction VALUES [nan '21' '05' '11' '0' '13' '30' '15' '09' '04' '36' '17' '26' '08' '31' '27' '33' '12' '23' '01' '32' '18' '25' '16' '07' '02' '35' '28' '34' '29' '24' '03' '14' '22' '06' '20' '10' '19' '00' '60' '39' '50' 'G3' 'G1' '55' '40' 'G4' 'C' '1' '3' '6' '5' '8' '47' '4' '45' '7' '38' '2' '90' '9' 'G2' 'G' '.9' 'OO' '37' 'N' '70' 'O' 'G6' '80' 'G5' '89' '66' '@@' 'G0' '75' '87' '24.0' '28.0' '5.0' '30.0' '33.0' '36.0' '11.0' '2.0' '22.0' '18.0' '20.0' '23.0' '15.0' '14.0' '29.0' '10.0' '35.0' '21.0' '13.0' '19.0' '12.0' '1.0' '27.0' '16.0' '9.0' '4.0' '34.0' '31.0' '7.0' '25.0' '17.0' '8.0' '32.0' '3.0' '6.0' '26.0' '0.0' '1B' '3N'] [+] c241 Wind speed in miles per hours VALUES [nan '15' '05' '07' '03' '10' '08' '06' '04' '09' '18' '12' '11' '24' '14' '22' '13' '25' '20' '19' '16' '17' '01' '02' '23' '8' '29' '26' '28' '44' '21' '52' '40' 'G1' 'G0' '2' 'G3' '90' 'G2' '30' '0' '00' 'G' '27' '34' '9' '31' 'S' 'B' '35' '4' '7' '5' '36' '6' '41' '55' '3' 'T' 'G4' 'G6' '60' 'X' '33' '50' '1' 'W' '43' '32' '80' '39' '99' '38' '51' '53' ')' '45' '0G' '46' '70' '56' '54' '58' 'G5' '49' '87' '66' '57' '37' '47' '48' '69' '9.0' '8.0' '5.0' '15.0' '6.0' '10.0' '0.0' '12.0' '7.0' '14.0' '3.0' '40.0' '2.0' '11.0' '25.0' '13.0' '16.0' '26.0' '21.0' '24.0' '4.0' '65.0' '23.0' '18.0' '20.0' '17.0' '35.0' '22.0' '28.0' '1.0' '30.0' '33.0' '19.0' '37.0' '32.0' '27.0' '57.0' '45.0' '31.0' '38.0' '87.0' '29.0' '84.0' '46.0' '58.0' '50.0' '36.0' '51.0' '75.0' '70.0' '92.0' '74.0' '80.0' '60.0' '34.0' '55.0' '42.0'] [+] c242 Gust indicator flag VALUES [nan 'G' '5' '0' '2' '8' '4' '3' '1' '6' 'B' 'A' '7'] [+] c243 Gust speed in miles per hour VALUES [nan '32' '16' '15' '12' '29' '40' '22' '27' '35' '18' '10' '30' '25' '31' '14' '20' '19' '28' '24' '17' '38' '36' '23' '26' '2' '34' '33' '52' '43' '21' '45' '53' '58' 
'05' '08' '06' '1' '0' '4' 'C' '42' '48' 'B' '37' '39' '7' '13' '60' '55' '07' '50' '44' 'G' 'S' 'L' '03' '02' '8' '04' '85' '09' '65' '80' '11' '5' '46' '70' '00' '90' '47' '75' '51' '41' '01' '77' '18.0' '25.0' '30.0' '15.0' '12.0' '20.0' '40.0' '22.0' '23.0' '35.0' '24.0' '10.0' '32.0' '28.0' '38.0' '19.0' '21.0' '41.0' '33.0' '27.0' '29.0' '55.0' '68.0' '45.0' '42.0' '36.0' '58.0' '37.0' '13.0' '7.0' '2.0' '17.0' '60.0' '8.0' '44.0' '39.0' '34.0' '16.0' '6.0' '26.0' '50.0' '11.0' '14.0' '46.0' '5.0' '9.0' '4.0' '31.0' '52.0' '48.0' '47.0' '70.0' '43.0' '3.0' '80.0' '54.0' '0.0' '65.0' '1.0' '62.0' '57.0' '49.0' '51.0' '56.0' '99.0'] [+] c41 Certificate type code of the pilot in command VALUES ['03' '01' '09' 'XX' '99' 'TT' 'F3' 'F9' nan 'F1' '00' '1' '3' '9' '0' '02' 'F' '04' 'F4' 'F2'] [+] c49 Qualification code of the pilot in command . VALUES ['E' 'G' 'H' 'M' nan 'F' 'R' 'N' 'P' 'S' 'T' 'U' 'W' 'V' 'X'] [+] c56 Total number of hours the pilot has flown. VALUES [ 1187. 364. 205. ... 6537. 15179. 11291.] [+] c96 Phase of flight code VALUES ['IB' 'DD' 'HB' 'IA' 'BD' 'BI' 'IH' 'DE' 'DA' 'BC' 'KE' 'IE' 'BG' 'BF' 'KC' 'HA' 'GX' 'IC' 'EX' 'FE' 'BE' 'JX' 'AD' 'KG' 'KD' 'BA' 'BH' 'AB' 'AA' 'HD' 'KB' 'KF' 'IF' 'AC' 'HC' 'KA' 'ID' 'FD' 'IG' 'FF' 'CL' 'DF' 'FG' 'BB' 'HH' 'HF' 'DB' 'FA' 'FB' 'DG' nan 'HG' 'FC' 'HE' 'DH' 'PJ' 'HM' 'SR' 'HK' 'PL' 'LD' 'TF' 'CR' 'MN' 'AP' 'TX' 'GR' 'OT' 'TR' 'UK' 'LA' 'TC' 'LR' 'HR' 'TD' 'LT' 'RU']
Como se puede observar, existe la mitad de columnas que contiene más del 50% de datos nulos o vacíos que habrá que tener en cuenta a la hora de la selección de caracteristicas para nuestro estudio. Además de valores atípicos para su columna por encontrarse desplazados.
Se observa que las clases de estudio (Accidente o Incidente) están balanceadas, así como el lugar de los sucesos que se distribuye por gran parte de la geografía mundial. En resumen, se puede decir que se está con un conjunto de datos balanceado.
Existe el atributo c32 que hace referencia al año de construcción del avión, pero todos sus valores son * o bien NaN. Esta característica se había escogido junto con el año del suceso para conocer la antigüedad del avión en el momento del suceso. Por lo tanto, dados los no valores de dicha columna, se deberán desestimar del conjunto de datos ambos atributos.
El atributo c242 contiene datos que no son claros sin más indicaciones que las que aporta la información del dataset. Y por este motivo se eliminará dicha columna del conjunto de datos, además, a nivel conceptual en el dataset, para las rachas de viento existe un segundo atributo, el c243, por lo que dicha caracteristica del incidente, se seguirá pudiendo analizar.
Por último, analizando el tipo de datos para el dataframe, se observa que la gran mayoría de atributos hace referencia a tipo objeto. Se debe a que determinados de estos atributos, tiene datos categóricos que mezcla numeros y letras en sus datos. No obstante, se analiza más adelante cuál de estas son realmente este tipo de columnas y cuáles son fruto de un error en los datos.
Con las conclusiones extraídas en el apartado anterior, en este apartado se realizan las correcciones necesarias para corregir estos errores.
Como se ha mencionado, la caracteristica c32 no tiene los valores que se esperaba, así que se elimina junto a c6. Así como c242 por no aportar datos claros.
# Discard c6/c32 (year attributes with unusable values) and c242 (unclear codes)
df = df.drop(['c6', 'c32', 'c242'], axis=1)
A continuación, se definen los valores que ha de tener cada una de las columnas categóricas para encontrar en qué otras columnas pueden existir y así poder relocalizarlas. Esta búsqueda se realiza sobre todo el dataset, de este modo, se podrá importar esos valores a las columnas del dataset de estudio.
# Expected value sets for each categorical column of the study model.
# They are matched against the whole dataset to locate displaced values.
# Type of Event
c1 = ['A', 'I']
# Region of the accident/incident location
c11 = ['AC','AG','AL','CE','EA','EU','FS','GL','NE','NM','SD','SO','SW','WA','WP']
# District office of the accident/incident location
c12 = ['ACRC','AC70','AC74','AC75','AC78','AC90','AGC1','ALRC','AL00','AL01','AL03','AL05','AL70','AL72','AL78','AL79',
'AL80','AL90','AL91','AUS','CA','CERC','CE00','CE01','CE03','CE05','CE07','CE09','CE40','CE42','CE43','CE44',
'CE45','CE46','CE47','CE48','CE49','CE50','CE51','CE52','CE53','CE54','CE55','CE56','CE57','CE58','CE59','CE70',
'CE71','CE72','CE73','CE78','CE79','CE80','CE90','CE91','EARC','EA00','EA01','EA03','EA05','EA07','EA09','EA11',
'EA13','EA15','EA16','EA17','EA19','EA21','EA23','EA25','EA27','EA29','EA31','EA33','EA35','EA70','EA71','EA72',
'EA73','EA74','EA78','EA79','EA80','EA90','EA91','EU00','EU01','EU02','EU03','EU70','EU72','EU73','EU74','EU76',
'FS01','GLRC','GL00','GL01','GL03','GL05','GL07','GL09','GL10','GL11','GL13','GL15','GL17','GL19','GL21','GL23',
'GL25','GL27','GL31','GL70','GL71','GL72','GL73','GL74','GL75','GL76','GL78','GL79','GL80','GL90','GL91','NERC',
'NE00','NE01','NE02','NE03','NE05','NE40','NE41','NE42','NE43','NE44','NE45','NE46','NE47','NE48','NE49','NE56',
'NE57','NE58','NE70','NE71','NE78','NE79','NE80','NE90','NE91','NMRC','NM00','NM01','NM02','NM03','NM04','NM05',
'NM06','NM07','NM08','NM09','NM10','NM11','NM13','NM40','NM41','NM42','NM43','NM46','NM47','NM48','NM50','NM51',
'NM56','NM57','NM58','NM70','NM71','NM72','NM73','NM78','NM79','NM80','NM81','NM82','NM83','NM90','NM91','SD00',
'SORC','SO00','SO01','SO03','SO05','SO06','SO07','SO08','SO09','SO11','SO13','SO14','SO15','SO16','SO17','SO19',
'SO21','SO23','SO25','SO27','SO31','SO33','SO64','SO70','SO71','SO72','SO73','SO74','SO75','SO76','SO77','SO78',
'SO79','SO80','SO90','SO91','SO92','SO93','SO94','SO95','SWRC','SW00','SW01','SW02','SW03','SW05','SW07','SW09',
'SW11','SW13','SW15','SW17','SW19','SW21','SW40','SW41','SW42','SW43','SW56','SW57','SW70','SW71','SW72','SW73',
'SW78','SW79','SW80','SW84','SW90','SW91','SW99','WA45','WA70','WA71','WA72','WA73','WA91','WPRC','WP00','WP01',
'WP02','WP03','WP05','WP06','WP07','WP09','WP11','WP13','WP15','WP17','WP19','WP21','WP23','WP25','WP27','WP28',
'WP29','WP30','WP31','WP33','WP70','WP71','WP72','WP73','WP74','WP75','WP76','WP77','WP78','WP79','WP80','WP81',
'WP82','WP83','WP90','WP91']
# State of the accident/incident location
c13 = ['AA','AB','AC','AF','AG','AI','AJ','AK','AL','AM','AN','AO','AQ','AR','AS','AT','AU',
'AV','AY','AZ','BA','BB','BC','BD','BE','BF','BG','BH','BK','BL','BM','BN','BO','BP',
'BQ','BR','BS','BT','BU','BV','BX','BY','CA','CB','CC','CD','CE','CF','CG','CH','CI',
'CJ','CK','CL','CM','CN','CO','CP','CQ','CR','CS','CT','CU','CV','CW','CY','CZ','DA',
'DC','DE','DJ','DM','DO','DQ','DR','EC','EG','EI','EK','EN','EQ','ER','ES','ET','EU',
'EZ','FA','FG','FI','FJ','FK','FL','FM','FO','FP','FQ','FR','FS','FT','GA','GB','GC',
'GE','GG','GH','GI','GJ','GK','GL','GM','GN','GO','GP','GQ','GR','GT','GV','GY','GZ',
'HA','HI','HK','HM','HO','HQ','HR','HU','IA','IC','ID','IE','II','IL','IM','IN','IO',
'IP','IR','IS','IT','IV','IZ','JA','JE','JM','JN','JO','JQ','JU','KE','KG','KN','KO',
'KQ','KR','KS','KT','KU','KY','KZ','LA','LE','LG','LH','LI','LO','LQ','LS','LT','LU',
'LY','L0','MA','MB','MC','MD','ME','MF','MG','MH','MI','MJ','MK','ML','MM','MN','MO',
'MP','MQ','MR','MS','MT','MU','MV','MW','MX','MY','MZ','M1','M2','M3','M4','NA','NB',
'NC','ND','NE','NF','NG','NH','NI','NJ','NL','NM','NO','NP','NR','NS','NT','NU','NV',
'NW','NX','NY','NZ','OH','OK','OR','PA','PC','PE','PF','PG','PK','PL','PN','PO','PP',
'PQ','PR','PS','PT','PU','QA','RE','RH','RI','RM','RO','RP','RQ','RS','RW','SA','SB',
'SC','SD','SE','SF','SG','SH','SI','SK','SL','SM','SN','SO','SP','SR','SS','ST','SU',
'SV','SW','SX','SY','SZ','S1','S2','TC','TD','TE','TG','TH','TI','TK','TL','TM','TN',
'TO','TP','TS','TU','TV','TW','TX','TZ','UG','UK','UP','UR','US','UT','UV','UY','UZ',
'VA','VC','VE','VI','VM','VQ','VT','WA','WF','WI','WN','WQ','WS','WV','WY','WZ','XX',
'YE','YM','YO','YS','ZA','ZI']
# Airworthiness class code of the aircraft
c30 = ['STRD', 'LIMT', 'REST', 'EXPT', 'PROV', 'MULT', 'FERY']
# Certificate type code of the pilot in command
c41 = ['F1', 'F3', 'F9', 'TT', 'XX', '00', '01', '02', '03', '09']
# Qualification code of the pilot in command
c49 = ['E', 'F', 'G', 'H','M', 'N', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X']
# Phase of flight code
c96 = ['AA','AB','AC','AD','BA','BB','BC','BD','BE','BF','BG','BH','BI','CX',
'DA','DB','DC','DD','DE','DF','DG','DH','EX','FA','FB','FC','FD','FE',
'FF','FG','GX','HA','HB','HC','HD','HE','HF','HG','HH','HK','HM','IA',
'IB','IC','ID','IE','IF','IG','IH','JX','KA','KB','KC','KD','KE','KF',
'KG','PJ','PL','SR']
# Primary flying condition code
c106 = ['A', 'B', 'C']
# Secondary flying condition code
c108 = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W']
# Light condition code
c110 = ['1', '2', '3', '4', '5']
# Sky condition code
c112 = ['BRKN', 'CLER', 'OBSC', 'OCST', 'SCAT']
# Visibility code
c114 = ['G10', 'LS1', '1', '2' ,'3' ,'4' ,'5' ,'6' ,'7' ,'8' ,'9' ,'10']
# Visibility restriction code
c115 = ['BLS', 'DRZ', 'DST', 'FOG', 'HAZ', 'RAI', 'SMK', 'SNO']
# List of (column name, expected values) tuples for the categorical columns
cat_cols = [('c1', c1), ('c11', c11), ('c12', c12), ('c13', c13), ('c30', c30), ('c41', c41),
('c49', c49), ('c96', c96), ('c106', c106), ('c108', c108), ('c110', c110),
('c112', c112), ('c114', c114), ('c115', c115)]
# For every candidate categorical column, find which columns of the full
# dataset contain at least one of its expected values.
match_cols = {}
# The enumerate index of the original loop was unused; unpack the tuples directly
for col_id, col_values in cat_cols:
    # Boolean frame: True wherever a cell holds one of the expected values
    df_match_values = data_df.isin(col_values)
    # One boolean per column: does the column contain any match at all?
    has_match = df_match_values.any()
    print(f'[+] COLUMN_DATA\t{col_id}')
    print(f'COLUMN_VALUES\t{col_values}')
    # Keep only the names of the columns where a match was found
    matched = list(has_match[has_match].index)
    match_cols[col_id] = matched
    print(f'MATCH_COLUMNS\t{matched}', end='\n\n')
[+] COLUMN_DATA c1 COLUMN_VALUES ['A', 'I'] MATCH_COLUMNS ['c1', 'c132', 'c134', 'c136', 'c138', 'c155', 'c102', 'c104', 'c106', 'c108', 'c242', 'c52', 'c80', 'c82', 'c84', 'c88', 'c90', 'c92', 'c100', 'c184', 'c192'] [+] COLUMN_DATA c11 COLUMN_VALUES ['AC', 'AG', 'AL', 'CE', 'EA', 'EU', 'FS', 'GL', 'NE', 'NM', 'SD', 'SO', 'SW', 'WA', 'WP'] MATCH_COLUMNS ['c157', 'c27', 'c38', 'c11', 'c13', 'c57', 'c59', 'c121', 'c127', 'c78', 'c86', 'c94', 'c96'] [+] COLUMN_DATA c12 COLUMN_VALUES ['ACRC', 'AC70', 'AC74', 'AC75', 'AC78', 'AC90', 'AGC1', 'ALRC', 'AL00', 'AL01', 'AL03', 'AL05', 'AL70', 'AL72', 'AL78', 'AL79', 'AL80', 'AL90', 'AL91', 'AUS', 'CA', 'CERC', 'CE00', 'CE01', 'CE03', 'CE05', 'CE07', 'CE09', 'CE40', 'CE42', 'CE43', 'CE44', 'CE45', 'CE46', 'CE47', 'CE48', 'CE49', 'CE50', 'CE51', 'CE52', 'CE53', 'CE54', 'CE55', 'CE56', 'CE57', 'CE58', 'CE59', 'CE70', 'CE71', 'CE72', 'CE73', 'CE78', 'CE79', 'CE80', 'CE90', 'CE91', 'EARC', 'EA00', 'EA01', 'EA03', 'EA05', 'EA07', 'EA09', 'EA11', 'EA13', 'EA15', 'EA16', 'EA17', 'EA19', 'EA21', 'EA23', 'EA25', 'EA27', 'EA29', 'EA31', 'EA33', 'EA35', 'EA70', 'EA71', 'EA72', 'EA73', 'EA74', 'EA78', 'EA79', 'EA80', 'EA90', 'EA91', 'EU00', 'EU01', 'EU02', 'EU03', 'EU70', 'EU72', 'EU73', 'EU74', 'EU76', 'FS01', 'GLRC', 'GL00', 'GL01', 'GL03', 'GL05', 'GL07', 'GL09', 'GL10', 'GL11', 'GL13', 'GL15', 'GL17', 'GL19', 'GL21', 'GL23', 'GL25', 'GL27', 'GL31', 'GL70', 'GL71', 'GL72', 'GL73', 'GL74', 'GL75', 'GL76', 'GL78', 'GL79', 'GL80', 'GL90', 'GL91', 'NERC', 'NE00', 'NE01', 'NE02', 'NE03', 'NE05', 'NE40', 'NE41', 'NE42', 'NE43', 'NE44', 'NE45', 'NE46', 'NE47', 'NE48', 'NE49', 'NE56', 'NE57', 'NE58', 'NE70', 'NE71', 'NE78', 'NE79', 'NE80', 'NE90', 'NE91', 'NMRC', 'NM00', 'NM01', 'NM02', 'NM03', 'NM04', 'NM05', 'NM06', 'NM07', 'NM08', 'NM09', 'NM10', 'NM11', 'NM13', 'NM40', 'NM41', 'NM42', 'NM43', 'NM46', 'NM47', 'NM48', 'NM50', 'NM51', 'NM56', 'NM57', 'NM58', 'NM70', 'NM71', 'NM72', 'NM73', 'NM78', 'NM79', 'NM80', 'NM81', 'NM82', 'NM83', 
'NM90', 'NM91', 'SD00', 'SORC', 'SO00', 'SO01', 'SO03', 'SO05', 'SO06', 'SO07', 'SO08', 'SO09', 'SO11', 'SO13', 'SO14', 'SO15', 'SO16', 'SO17', 'SO19', 'SO21', 'SO23', 'SO25', 'SO27', 'SO31', 'SO33', 'SO64', 'SO70', 'SO71', 'SO72', 'SO73', 'SO74', 'SO75', 'SO76', 'SO77', 'SO78', 'SO79', 'SO80', 'SO90', 'SO91', 'SO92', 'SO93', 'SO94', 'SO95', 'SWRC', 'SW00', 'SW01', 'SW02', 'SW03', 'SW05', 'SW07', 'SW09', 'SW11', 'SW13', 'SW15', 'SW17', 'SW19', 'SW21', 'SW40', 'SW41', 'SW42', 'SW43', 'SW56', 'SW57', 'SW70', 'SW71', 'SW72', 'SW73', 'SW78', 'SW79', 'SW80', 'SW84', 'SW90', 'SW91', 'SW99', 'WA45', 'WA70', 'WA71', 'WA72', 'WA73', 'WA91', 'WPRC', 'WP00', 'WP01', 'WP02', 'WP03', 'WP05', 'WP06', 'WP07', 'WP09', 'WP11', 'WP13', 'WP15', 'WP17', 'WP19', 'WP21', 'WP23', 'WP25', 'WP27', 'WP28', 'WP29', 'WP30', 'WP31', 'WP33', 'WP70', 'WP71', 'WP72', 'WP73', 'WP74', 'WP75', 'WP76', 'WP77', 'WP78', 'WP79', 'WP80', 'WP81', 'WP82', 'WP83', 'WP90', 'WP91'] MATCH_COLUMNS ['c12', 'c13', 'c58', 'c59', 'c121', 'c122', 'c128', 'c94'] [+] COLUMN_DATA c13 COLUMN_VALUES ['AA', 'AB', 'AC', 'AF', 'AG', 'AI', 'AJ', 'AK', 'AL', 'AM', 'AN', 'AO', 'AQ', 'AR', 'AS', 'AT', 'AU', 'AV', 'AY', 'AZ', 'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG', 'BH', 'BK', 'BL', 'BM', 'BN', 'BO', 'BP', 'BQ', 'BR', 'BS', 'BT', 'BU', 'BV', 'BX', 'BY', 'CA', 'CB', 'CC', 'CD', 'CE', 'CF', 'CG', 'CH', 'CI', 'CJ', 'CK', 'CL', 'CM', 'CN', 'CO', 'CP', 'CQ', 'CR', 'CS', 'CT', 'CU', 'CV', 'CW', 'CY', 'CZ', 'DA', 'DC', 'DE', 'DJ', 'DM', 'DO', 'DQ', 'DR', 'EC', 'EG', 'EI', 'EK', 'EN', 'EQ', 'ER', 'ES', 'ET', 'EU', 'EZ', 'FA', 'FG', 'FI', 'FJ', 'FK', 'FL', 'FM', 'FO', 'FP', 'FQ', 'FR', 'FS', 'FT', 'GA', 'GB', 'GC', 'GE', 'GG', 'GH', 'GI', 'GJ', 'GK', 'GL', 'GM', 'GN', 'GO', 'GP', 'GQ', 'GR', 'GT', 'GV', 'GY', 'GZ', 'HA', 'HI', 'HK', 'HM', 'HO', 'HQ', 'HR', 'HU', 'IA', 'IC', 'ID', 'IE', 'II', 'IL', 'IM', 'IN', 'IO', 'IP', 'IR', 'IS', 'IT', 'IV', 'IZ', 'JA', 'JE', 'JM', 'JN', 'JO', 'JQ', 'JU', 'KE', 'KG', 'KN', 'KO', 'KQ', 'KR', 'KS', 
'KT', 'KU', 'KY', 'KZ', 'LA', 'LE', 'LG', 'LH', 'LI', 'LO', 'LQ', 'LS', 'LT', 'LU', 'LY', 'L0', 'MA', 'MB', 'MC', 'MD', 'ME', 'MF', 'MG', 'MH', 'MI', 'MJ', 'MK', 'ML', 'MM', 'MN', 'MO', 'MP', 'MQ', 'MR', 'MS', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'M1', 'M2', 'M3', 'M4', 'NA', 'NB', 'NC', 'ND', 'NE', 'NF', 'NG', 'NH', 'NI', 'NJ', 'NL', 'NM', 'NO', 'NP', 'NR', 'NS', 'NT', 'NU', 'NV', 'NW', 'NX', 'NY', 'NZ', 'OH', 'OK', 'OR', 'PA', 'PC', 'PE', 'PF', 'PG', 'PK', 'PL', 'PN', 'PO', 'PP', 'PQ', 'PR', 'PS', 'PT', 'PU', 'QA', 'RE', 'RH', 'RI', 'RM', 'RO', 'RP', 'RQ', 'RS', 'RW', 'SA', 'SB', 'SC', 'SD', 'SE', 'SF', 'SG', 'SH', 'SI', 'SK', 'SL', 'SM', 'SN', 'SO', 'SP', 'SR', 'SS', 'ST', 'SU', 'SV', 'SW', 'SX', 'SY', 'SZ', 'S1', 'S2', 'TC', 'TD', 'TE', 'TG', 'TH', 'TI', 'TK', 'TL', 'TM', 'TN', 'TO', 'TP', 'TS', 'TU', 'TV', 'TW', 'TX', 'TZ', 'UG', 'UK', 'UP', 'UR', 'US', 'UT', 'UV', 'UY', 'UZ', 'VA', 'VC', 'VE', 'VI', 'VM', 'VQ', 'VT', 'WA', 'WF', 'WI', 'WN', 'WQ', 'WS', 'WV', 'WY', 'WZ', 'XX', 'YE', 'YM', 'YO', 'YS', 'ZA', 'ZI'] MATCH_COLUMNS ['c144', 'c157', 'c160', 'c162', 'c27', 'c38', 'c11', 'c13', 'c118', 'c41', 'c57', 'c59', 'c121', 'c127', 'c78', 'c86', 'c94', 'c96'] [+] COLUMN_DATA c30 COLUMN_VALUES ['STRD', 'LIMT', 'REST', 'EXPT', 'PROV', 'MULT', 'FERY'] MATCH_COLUMNS ['c30'] [+] COLUMN_DATA c41 COLUMN_VALUES ['F1', 'F3', 'F9', 'TT', 'XX', '00', '01', '02', '03', '09'] MATCH_COLUMNS ['c13', 'c240', 'c241', 'c243', 'c41', 'c59', 'c121', 'c127'] [+] COLUMN_DATA c49 COLUMN_VALUES ['E', 'F', 'G', 'H', 'M', 'N', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X'] MATCH_COLUMNS ['c132', 'c134', 'c136', 'c138', 'c145', 'c147', 'c155', 'c233', 'c234', 'c102', 'c104', 'c108', 'c242', 'c49', 'c52', 'c80', 'c82', 'c84', 'c88', 'c90', 'c92', 'c98', 'c123', 'c184', 'c192'] [+] COLUMN_DATA c96 COLUMN_VALUES ['AA', 'AB', 'AC', 'AD', 'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'CX', 'DA', 'DB', 'DC', 'DD', 'DE', 'DF', 'DG', 'DH', 'EX', 'FA', 'FB', 'FC', 'FD', 'FE', 'FF', 'FG', 
'GX', 'HA', 'HB', 'HC', 'HD', 'HE', 'HF', 'HG', 'HH', 'HK', 'HM', 'IA', 'IB', 'IC', 'ID', 'IE', 'IF', 'IG', 'IH', 'JX', 'KA', 'KB', 'KC', 'KD', 'KE', 'KF', 'KG', 'PJ', 'PL', 'SR'] MATCH_COLUMNS ['c13', 'c59', 'c121', 'c78', 'c86', 'c94', 'c96'] [+] COLUMN_DATA c106 COLUMN_VALUES ['A', 'B', 'C'] MATCH_COLUMNS ['c1', 'c132', 'c134', 'c136', 'c138', 'c147', 'c155', 'c102', 'c104', 'c106', 'c108', 'c242', 'c52', 'c80', 'c82', 'c84', 'c88', 'c90', 'c92', 'c100', 'c184', 'c192'] [+] COLUMN_DATA c108 COLUMN_VALUES ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W'] MATCH_COLUMNS ['c1', 'c132', 'c134', 'c136', 'c138', 'c145', 'c147', 'c155', 'c233', 'c234', 'c102', 'c104', 'c106', 'c108', 'c242', 'c49', 'c52', 'c80', 'c82', 'c84', 'c88', 'c90', 'c92', 'c98', 'c100', 'c123', 'c184', 'c192'] [+] COLUMN_DATA c110 COLUMN_VALUES ['1', '2', '3', '4', '5'] MATCH_COLUMNS ['c10', 'c145', 'c153', 'c155', 'c24', 'c25', 'c102', 'c104', 'c113', 'c114', 'c242', 'c41', 'c126', 'c80', 'c88', 'c100', 'c244'] [+] COLUMN_DATA c112 COLUMN_VALUES ['BRKN', 'CLER', 'OBSC', 'OCST', 'SCAT'] MATCH_COLUMNS ['c112', 'c113'] [+] COLUMN_DATA c114 COLUMN_VALUES ['G10', 'LS1', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] MATCH_COLUMNS ['c10', 'c145', 'c149', 'c153', 'c155', 'c24', 'c25', 'c102', 'c104', 'c113', 'c114', 'c240', 'c241', 'c242', 'c243', 'c41', 'c126', 'c80', 'c88', 'c100', 'c244'] [+] COLUMN_DATA c115 COLUMN_VALUES ['BLS', 'DRZ', 'DST', 'FOG', 'HAZ', 'RAI', 'SMK', 'SNO'] MATCH_COLUMNS ['c115']
Como se observa, existe para varias de las columnas, valores que a su vez están en otras columnas del dataset. A continuación se detalla y analiza si esos valores deberán ser devueltos a su columna de origen. Para ello, se realiza una comparación de todo el dataset original con las columnas categoricas que se ha preseleccionado.
Para esta comparación, se tiene en cuenta que los valores coincidentes no son los de un estado ya que estos, existen en otras columnas y son un elevado numero de datos para poder monitorizarlos.
# Existe unos códigos de estados los cuáles son compartidos por varias columnas
states = c13
def moved_values(cat_column: str, conflict_col: list):
    """Report which values of `cat_column` also appear in conflicting columns.

    For every dataframe column listed in `conflict_col` (except the
    categorical column itself) the unique values are compared against the
    expected values of `cat_column`. State codes are filtered out first,
    since they legitimately live in several columns.
    """
    # Expected values registered for the categorical column.
    expected = [entry[1] for entry in cat_cols if entry[0] == cat_column][0]
    print(f"CAT_COLUMN\t{cat_column}\t{col_name(cat_column)}\nVALUES\t{expected}\n")
    # Walk every dataframe column and report overlaps.
    for column in data_df:
        # Only inspect conflicting columns, skipping the column under study.
        if column == cat_column or column not in conflict_col:
            continue
        unique_vals = data_df[column].unique()
        # Discard state codes: too ubiquitous to be tracked reliably.
        candidates = [v for v in unique_vals if str(v) not in states]
        print(f"[+] {column}\t{col_name(column)}")
        shared = set(expected) & set(candidates)
        if shared:
            print(f"COMMON_VALUES\t{shared}\n")
        else:
            print('STATE MATCHES\n')
Sobre la columna c1 no se realiza ninguna comprobación ya que sus valores son comunes a muchas otras columnas y no se podría realizar una recolocación con certeza.
# Categorical column whose stray values we want to trace
cat_column = 'c11'
# Columns previously found to share values with it
conflict_col = match_cols[cat_column]
moved_values(cat_column, conflict_col)
CAT_COLUMN c11 Region of the accident/incident location.
VALUES ['AC', 'AG', 'AL', 'CE', 'EA', 'EU', 'FS', 'GL', 'NE', 'NM', 'SD', 'SO', 'SW', 'WA', 'WP']
[+] c157 Type of Landing Gear code.
STATE MATCHES
[+] c27 Region of certification for the aircraft
COMMON_VALUES {'EA', 'WP'}
[+] c38 Certificating region of the engine
COMMON_VALUES {'EA', 'WP'}
[+] c13 State of the accident/incident location.
STATE MATCHES
[+] c57 Residence region code of the pilot in command
COMMON_VALUES {'EA', 'WP'}
[+] c59 Residence state of the pilot in command
STATE MATCHES
[+] c121 State of the owner / operator
STATE MATCHES
[+] c127 Region of the air operator
COMMON_VALUES {'EA', 'WP'}
[+] c78 Primary cause factor code
STATE MATCHES
[+] c86 Secondary cause factor code
STATE MATCHES
[+] c94 Type of accident code
STATE MATCHES
[+] c96 Phase of flight code
STATE MATCHES
Para la columna de las regiones, existe coincidencias con otras columnas del dataset original que hacen referencia también a las regiones. Es decir, no hay ningún valor movido a columnas que no debiera.
# Categorical column whose stray values we want to trace
cat_column = 'c12'
# Columns previously found to share values with it
conflict_col = match_cols[cat_column]
moved_values(cat_column, conflict_col)
CAT_COLUMN c12 District office of the accident/incident location.
VALUES ['ACRC', 'AC70', 'AC74', 'AC75', 'AC78', 'AC90', 'AGC1', 'ALRC', 'AL00', 'AL01', 'AL03', 'AL05', 'AL70', 'AL72', 'AL78', 'AL79', 'AL80', 'AL90', 'AL91', 'AUS', 'CA', 'CERC', 'CE00', 'CE01', 'CE03', 'CE05', 'CE07', 'CE09', 'CE40', 'CE42', 'CE43', 'CE44', 'CE45', 'CE46', 'CE47', 'CE48', 'CE49', 'CE50', 'CE51', 'CE52', 'CE53', 'CE54', 'CE55', 'CE56', 'CE57', 'CE58', 'CE59', 'CE70', 'CE71', 'CE72', 'CE73', 'CE78', 'CE79', 'CE80', 'CE90', 'CE91', 'EARC', 'EA00', 'EA01', 'EA03', 'EA05', 'EA07', 'EA09', 'EA11', 'EA13', 'EA15', 'EA16', 'EA17', 'EA19', 'EA21', 'EA23', 'EA25', 'EA27', 'EA29', 'EA31', 'EA33', 'EA35', 'EA70', 'EA71', 'EA72', 'EA73', 'EA74', 'EA78', 'EA79', 'EA80', 'EA90', 'EA91', 'EU00', 'EU01', 'EU02', 'EU03', 'EU70', 'EU72', 'EU73', 'EU74', 'EU76', 'FS01', 'GLRC', 'GL00', 'GL01', 'GL03', 'GL05', 'GL07', 'GL09', 'GL10', 'GL11', 'GL13', 'GL15', 'GL17', 'GL19', 'GL21', 'GL23', 'GL25', 'GL27', 'GL31', 'GL70', 'GL71', 'GL72', 'GL73', 'GL74', 'GL75', 'GL76', 'GL78', 'GL79', 'GL80', 'GL90', 'GL91', 'NERC', 'NE00', 'NE01', 'NE02', 'NE03', 'NE05', 'NE40', 'NE41', 'NE42', 'NE43', 'NE44', 'NE45', 'NE46', 'NE47', 'NE48', 'NE49', 'NE56', 'NE57', 'NE58', 'NE70', 'NE71', 'NE78', 'NE79', 'NE80', 'NE90', 'NE91', 'NMRC', 'NM00', 'NM01', 'NM02', 'NM03', 'NM04', 'NM05', 'NM06', 'NM07', 'NM08', 'NM09', 'NM10', 'NM11', 'NM13', 'NM40', 'NM41', 'NM42', 'NM43', 'NM46', 'NM47', 'NM48', 'NM50', 'NM51', 'NM56', 'NM57', 'NM58', 'NM70', 'NM71', 'NM72', 'NM73', 'NM78', 'NM79', 'NM80', 'NM81', 'NM82', 'NM83', 'NM90', 'NM91', 'SD00', 'SORC', 'SO00', 'SO01', 'SO03', 'SO05', 'SO06', 'SO07', 'SO08', 'SO09', 'SO11', 'SO13', 'SO14', 'SO15', 'SO16', 'SO17', 'SO19', 'SO21', 'SO23', 'SO25', 'SO27', 'SO31', 'SO33', 'SO64', 'SO70', 'SO71', 'SO72', 'SO73', 'SO74', 'SO75', 'SO76', 'SO77', 'SO78', 'SO79', 'SO80', 'SO90', 'SO91', 'SO92', 'SO93', 'SO94', 'SO95', 'SWRC', 'SW00', 'SW01', 'SW02', 'SW03', 'SW05', 'SW07', 'SW09', 'SW11', 'SW13', 'SW15', 'SW17', 'SW19', 'SW21', 'SW40', 'SW41', 'SW42', 
'SW43', 'SW56', 'SW57', 'SW70', 'SW71', 'SW72', 'SW73', 'SW78', 'SW79', 'SW80', 'SW84', 'SW90', 'SW91', 'SW99', 'WA45', 'WA70', 'WA71', 'WA72', 'WA73', 'WA91', 'WPRC', 'WP00', 'WP01', 'WP02', 'WP03', 'WP05', 'WP06', 'WP07', 'WP09', 'WP11', 'WP13', 'WP15', 'WP17', 'WP19', 'WP21', 'WP23', 'WP25', 'WP27', 'WP28', 'WP29', 'WP30', 'WP31', 'WP33', 'WP70', 'WP71', 'WP72', 'WP73', 'WP74', 'WP75', 'WP76', 'WP77', 'WP78', 'WP79', 'WP80', 'WP81', 'WP82', 'WP83', 'WP90', 'WP91']
[+] c13 State of the accident/incident location.
STATE MATCHES
[+] c58 Residence district office code of the pilot in command
COMMON_VALUES {'WP05', 'NM06', 'SW01', 'GL15', 'WP11', 'GL11', 'NM00', 'EA13', 'AL00', 'SO09', 'NE71', 'EA21', 'WP21', 'EA05', 'EA31', 'SO17', 'GL27', 'EA07', 'CE40', 'GL03', 'SO64', 'EU00', 'SW15', 'SW03', 'SO15', 'EA23', 'GL19', 'SO03', 'SW17', 'NM09', 'NM04', 'SO07', 'EU01', 'GL07', 'SO08', 'WP31', 'EA35', 'WP07', 'GL25', 'NM05', 'SO13', 'WP13', 'WP00', 'SO01', 'SW99', 'SW07', 'SO19', 'EA33', 'NM02', 'EU03', 'GL10', 'SO23', 'WP15', 'WP09', 'NE01', 'AL01', 'WP01', 'AL05', 'EA19', 'EU70', 'WP33', 'CE07', 'SW05', 'EA11', 'EA00', 'SO11', 'SW09', 'SO25', 'GL31', 'GL21', 'NM13', 'CE05', 'NM08', 'NE03', 'SW02', 'SO21', 'WP06', 'EA16', 'CE01', 'SW11', 'NM01', 'EA73', 'SW00', 'GL01', 'SO14', 'SW19', 'SO00', 'EA71', 'EA29', 'EU02', 'WP19', 'WP02', 'NM11', 'WP17', 'SW13', 'NM03', 'CE03', 'SO33', 'SO16', 'GL13', 'CE00', 'WP27', 'WP03', 'NE02', 'CE45', 'WP28', 'GL00', 'NE00', 'SO06', 'WP25', 'NM10', 'GL09', 'EA15', 'EA09', 'EA01', 'SW21', 'CE09', 'EA03', 'GL17', 'GL05', 'EA25', 'EA17', 'GL23', 'AL03', 'SO05', 'NE05', 'WP23', 'NM07', 'EA27'}
[+] c59 Residence state of the pilot in command
STATE MATCHES
[+] c121 State of the owner / operator
STATE MATCHES
[+] c122 Designator code of the air operator (OPCD)
COMMON_VALUES {'CE07'}
[+] c128 District office of the air operator
COMMON_VALUES {'WP05', 'NM06', 'SW01', 'GL15', 'WP11', 'GL11', 'NM00', 'EA13', 'SO09', 'EA21', 'WP21', 'EA05', 'SO17', 'EA31', 'GL27', 'EA07', 'WP29', 'GL03', 'SO64', 'EU00', 'SW15', 'SW03', 'SO15', 'EA23', 'GL19', 'SO03', 'SW17', 'NM09', 'NM04', 'SO07', 'EU01', 'GL07', 'SO08', 'WP31', 'WP07', 'GL25', 'NM05', 'SO13', 'WP13', 'WP00', 'SO01', 'SW07', 'SO19', 'EA33', 'NM02', 'GL10', 'SO23', 'WP15', 'WP09', 'NE01', 'AL01', 'WP01', 'AL05', 'EA19', 'WP33', 'CE07', 'SW05', 'EA11', 'EA00', 'SO11', 'SW09', 'SO25', 'SO31', 'GL31', 'GL21', 'NM13', 'CE05', 'NM08', 'NE03', 'SW02', 'WP06', 'SO21', 'EA16', 'CE01', 'SW11', 'NM01', 'SW00', 'GL01', 'SW19', 'SO00', 'EA29', 'WP19', 'WP02', 'NM11', 'WP17', 'SW13', 'NM03', 'SO33', 'CE03', 'GL13', 'CE00', 'WP27', 'WP03', 'NE02', 'WP28', 'SO27', 'WP25', 'NM10', 'GL09', 'EA15', 'EA09', 'EA01', 'SW21', 'CE09', 'EA03', 'GL17', 'GL05', 'EA25', 'EA17', 'AL03', 'GL23', 'SO05', 'NE05', 'WP23', 'NM07', 'EA27'}
[+] c94 Type of accident code
STATE MATCHES
Para la columna de los distritos, existe coincidencias con otras columnas del dataset original que hacen referencia también a estas. Es decir, no hay ningún valor movido a columnas que no debiera.
Sobre la columna c30 no se realiza ninguna comprobación ya que sus valores no existen en ninguna otra columna.
# Categorical column whose stray values we want to trace
cat_column = 'c41'
# Columns previously found to share values with it
conflict_col = match_cols[cat_column]
moved_values(cat_column, conflict_col)
CAT_COLUMN c41 Certificate type code of the pilot in command
VALUES ['F1', 'F3', 'F9', 'TT', 'XX', '00', '01', '02', '03', '09']
[+] c13 State of the accident/incident location.
COMMON_VALUES {'TT'}
[+] c240 Wind direction
COMMON_VALUES {'03', '00', '01', '02', '09'}
[+] c241 Wind speed in miles per hours
COMMON_VALUES {'03', '00', '01', '02', '09'}
[+] c243 Gust speed in miles per hour
COMMON_VALUES {'03', '00', '01', '02', '09'}
[+] c59 Residence state of the pilot in command
COMMON_VALUES {'TT', '00'}
[+] c121 State of the owner / operator
COMMON_VALUES {'00'}
[+] c127 Region of the air operator
COMMON_VALUES {'03'}
Para las columnas con coincidencias en valores numérico, no se decide aplicar ningún desplazamiento de datos. Esto se debe a que son ambiguos y existe en muchas columnas, no obstante, los valores categóricos de las columnas c13, c59 si se relocalizarán a su columna de origen por ser unicos de c41.
# Categorical column whose stray values we want to trace
cat_column = 'c49'
# Columns previously found to share values with it
conflict_col = match_cols[cat_column]
moved_values(cat_column, conflict_col)
CAT_COLUMN c49 Qualification code of the pilot in command .
VALUES ['E', 'F', 'G', 'H', 'M', 'N', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X']
[+] c132 First remedial action area code
COMMON_VALUES {'T', 'M', 'R', 'P', 'S', 'X', 'N', 'V', 'W', 'G', 'U', 'E', 'H', 'F'}
[+] c134 First remedial action taken code
COMMON_VALUES {'R', 'X', 'N', 'G', 'E', 'H', 'F'}
[+] c136 Second remedial action area code
COMMON_VALUES {'T', 'M', 'R', 'P', 'S', 'X', 'N', 'V', 'W', 'G', 'U', 'E', 'H', 'F'}
[+] c138 Second remedial action taken code
COMMON_VALUES {'R', 'X', 'G', 'E', 'H', 'F'}
[+] c145 Code for Weight Class
COMMON_VALUES {'M', 'S', 'N'}
[+] c147 Wing information of the aircraft involved in the investigation.
COMMON_VALUES {'M', 'P', 'S', 'X', 'W', 'G', 'E', 'H', 'F'}
[+] c155 Type of the engine code.
COMMON_VALUES {'R', 'T', 'X', 'V', 'W', 'U', 'F'}
[+] c233 Flag to indicate if the pilot was killed.
COMMON_VALUES {'N'}
[+] c234 Flag to indicate if the second pilot was killed.
COMMON_VALUES {'N'}
[+] c102 Primary type of flying code
COMMON_VALUES {'G', 'E', 'F', 'H'}
[+] c104 Secondary type of flying code
COMMON_VALUES {'T', 'M', 'R', 'P', 'S', 'X', 'N', 'V', 'W', 'G', 'U', 'E', 'H', 'F'}
[+] c108 Secondary flying condition code
COMMON_VALUES {'T', 'M', 'R', 'P', 'S', 'N', 'V', 'W', 'G', 'U', 'E', 'H', 'F'}
[+] c242 Gust indicator flag
COMMON_VALUES {'G'}
[+] c52 Profession code of the pilot in command
COMMON_VALUES {'M', 'P', 'N', 'G', 'E', 'H', 'F'}
[+] c80 Contributing factor code
COMMON_VALUES {'T', 'M', 'R', 'P', 'S', 'X', 'N', 'V', 'W', 'G', 'U', 'E', 'H', 'F'}
[+] c82 Persons involved code
COMMON_VALUES {'R', 'M', 'P', 'V', 'N', 'G', 'E', 'H', 'F'}
[+] c84 Supporting factor code TECHNICAL
COMMON_VALUES {'T', 'M', 'R', 'P', 'S', 'X', 'N', 'V', 'W', 'G', 'U', 'E', 'H', 'F'}
[+] c88 Second contributing case code f
COMMON_VALUES {'T', 'M', 'R', 'P', 'S', 'X', 'N', 'V', 'W', 'G', 'U', 'E', 'H', 'F'}
[+] c90 Second persons involved code
COMMON_VALUES {'R', 'M', 'P', 'S', 'N', 'V', 'G', 'E', 'H', 'F'}
[+] c92 Second supporting factor code . TECHNICAL
COMMON_VALUES {'T', 'M', 'R', 'P', 'S', 'X', 'N', 'V', 'W', 'G', 'U', 'E', 'H', 'F'}
[+] c98 Damage code of aircraft.
COMMON_VALUES {'M', 'S', 'U', 'N'}
[+] c123 Emergency Locator Transmitter on Board? Y or N
COMMON_VALUES {'P', 'U', 'N'}
[+] c184 Supporting cause factor A code. OPERATIONAL
COMMON_VALUES {'T', 'M', 'R', 'P', 'S', 'X', 'N', 'V', 'W', 'G', 'U', 'E', 'H', 'F'}
[+] c192 Supporting cause factor B code. OPERATIONAL
COMMON_VALUES {'T', 'M', 'R', 'P', 'S', 'N', 'V', 'W', 'G', 'U', 'E', 'H', 'F'}
Se desestima los cambios para esta columna ya que sus datos categóricos, son compartidos con varias columnas del dataset.
# Categorical column whose stray values we want to trace
cat_column = 'c96'
# Columns previously found to share values with it
conflict_col = match_cols[cat_column]
moved_values(cat_column, conflict_col)
CAT_COLUMN c96 Phase of flight code
VALUES ['AA', 'AB', 'AC', 'AD', 'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'CX', 'DA', 'DB', 'DC', 'DD', 'DE', 'DF', 'DG', 'DH', 'EX', 'FA', 'FB', 'FC', 'FD', 'FE', 'FF', 'FG', 'GX', 'HA', 'HB', 'HC', 'HD', 'HE', 'HF', 'HG', 'HH', 'HK', 'HM', 'IA', 'IB', 'IC', 'ID', 'IE', 'IF', 'IG', 'IH', 'JX', 'KA', 'KB', 'KC', 'KD', 'KE', 'KF', 'KG', 'PJ', 'PL', 'SR']
[+] c13 State of the accident/incident location.
STATE MATCHES
[+] c59 Residence state of the pilot in command
STATE MATCHES
[+] c121 State of the owner / operator
STATE MATCHES
[+] c78 Primary cause factor code
COMMON_VALUES {'IF', 'FE', 'HG', 'IG', 'FC'}
[+] c86 Secondary cause factor code
COMMON_VALUES {'IF', 'FE', 'HG', 'IG', 'FC'}
[+] c94 Type of accident code
COMMON_VALUES {'IH', 'EX', 'KA', 'HB', 'JX', 'IB', 'KC', 'GX', 'CX'}
Para las columnas c78 y c86 se desestiman los cambios ya que las coincidencias son valores propios de estas columnas.
Para la columna c94, los valores CX, KA, JX, IH, KC son desplazados ya que son propios unicamente de la columna estudiada, c96, por lo tanto, estos valores se moverán a su columna de origen.
# Categorical column whose stray values we want to trace
cat_column = 'c106'
# Columns previously found to share values with it
conflict_col = match_cols[cat_column]
moved_values(cat_column, conflict_col)
CAT_COLUMN c106 Primary flying condition code
VALUES ['A', 'B', 'C']
[+] c1 Type of Event
COMMON_VALUES {'A'}
[+] c132 First remedial action area code
COMMON_VALUES {'C', 'A', 'B'}
[+] c134 First remedial action taken code
COMMON_VALUES {'C', 'A', 'B'}
[+] c136 Second remedial action area code
COMMON_VALUES {'C', 'A', 'B'}
[+] c138 Second remedial action taken code
COMMON_VALUES {'C', 'A', 'B'}
[+] c147 Wing information of the aircraft involved in the investigation.
COMMON_VALUES {'B'}
[+] c155 Type of the engine code.
COMMON_VALUES {'B'}
[+] c102 Primary type of flying code
COMMON_VALUES {'C', 'A', 'B'}
[+] c104 Secondary type of flying code
COMMON_VALUES {'C', 'A', 'B'}
[+] c108 Secondary flying condition code
COMMON_VALUES {'C', 'A', 'B'}
[+] c242 Gust indicator flag
COMMON_VALUES {'A', 'B'}
[+] c52 Profession code of the pilot in command
COMMON_VALUES {'C', 'A', 'B'}
[+] c80 Contributing factor code
COMMON_VALUES {'C', 'A', 'B'}
[+] c82 Persons involved code
COMMON_VALUES {'C', 'A', 'B'}
[+] c84 Supporting factor code TECHNICAL
COMMON_VALUES {'C', 'A', 'B'}
[+] c88 Second contributing case code f
COMMON_VALUES {'C', 'A', 'B'}
[+] c90 Second persons involved code
COMMON_VALUES {'C', 'A', 'B'}
[+] c92 Second supporting factor code . TECHNICAL
COMMON_VALUES {'C', 'A', 'B'}
[+] c100 General cause category code
COMMON_VALUES {'A'}
[+] c184 Supporting cause factor A code. OPERATIONAL
COMMON_VALUES {'C', 'A', 'B'}
[+] c192 Supporting cause factor B code. OPERATIONAL
COMMON_VALUES {'C', 'A', 'B'}
Se desestima los cambios para esta columna ya que sus datos categóricos, son compartidos con varias columnas en el dataset como propios de estas.
# Categorical column whose stray values we want to trace
cat_column = 'c108'
# Columns previously found to share values with it
conflict_col = match_cols[cat_column]
moved_values(cat_column, conflict_col)
CAT_COLUMN c108 Secondary flying condition code
VALUES ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W']
[+] c1 Type of Event
COMMON_VALUES {'A', 'I'}
[+] c132 First remedial action area code
COMMON_VALUES {'O', 'N', 'W', 'U', 'J', 'F', 'B', 'R', 'V', 'I', 'T', 'P', 'S', 'G', 'Q', 'D', 'C', 'M', 'L', 'A', 'E', 'K', 'H'}
[+] c134 First remedial action taken code
COMMON_VALUES {'O', 'B', 'D', 'R', 'C', 'N', 'A', 'G', 'E', 'J', 'K', 'H', 'F', 'I'}
[+] c136 Second remedial action area code
COMMON_VALUES {'O', 'N', 'W', 'U', 'J', 'F', 'B', 'R', 'V', 'I', 'T', 'P', 'S', 'G', 'Q', 'D', 'C', 'M', 'L', 'A', 'E', 'K', 'H'}
[+] c138 Second remedial action taken code
COMMON_VALUES {'B', 'D', 'R', 'C', 'A', 'G', 'E', 'J', 'K', 'H', 'F', 'I'}
[+] c145 Code for Weight Class
COMMON_VALUES {'M', 'S', 'D', 'N'}
[+] c147 Wing information of the aircraft involved in the investigation.
COMMON_VALUES {'Q', 'B', 'D', 'M', 'P', 'S', 'W', 'L', 'G', 'E', 'K', 'H', 'F'}
[+] c155 Type of the engine code.
COMMON_VALUES {'O', 'B', 'R', 'T', 'V', 'W', 'U', 'J', 'F', 'I'}
[+] c233 Flag to indicate if the pilot was killed.
COMMON_VALUES {'N'}
[+] c234 Flag to indicate if the second pilot was killed.
COMMON_VALUES {'N'}
[+] c102 Primary type of flying code
COMMON_VALUES {'B', 'D', 'C', 'G', 'A', 'E', 'J', 'H', 'F'}
[+] c104 Secondary type of flying code
COMMON_VALUES {'O', 'N', 'W', 'U', 'J', 'F', 'B', 'R', 'V', 'I', 'T', 'P', 'S', 'G', 'Q', 'D', 'C', 'M', 'L', 'A', 'E', 'K', 'H'}
[+] c106 Primary flying condition code
COMMON_VALUES {'C', 'A', 'B'}
[+] c242 Gust indicator flag
COMMON_VALUES {'G', 'A', 'B'}
[+] c49 Qualification code of the pilot in command .
COMMON_VALUES {'T', 'M', 'R', 'P', 'S', 'N', 'W', 'V', 'G', 'U', 'E', 'H', 'F'}
[+] c52 Profession code of the pilot in command
COMMON_VALUES {'O', 'Q', 'B', 'D', 'C', 'M', 'P', 'N', 'L', 'G', 'A', 'E', 'J', 'K', 'H', 'F'}
[+] c80 Contributing factor code
COMMON_VALUES {'O', 'N', 'W', 'U', 'J', 'F', 'B', 'R', 'V', 'I', 'T', 'P', 'S', 'G', 'Q', 'D', 'C', 'M', 'L', 'A', 'E', 'K', 'H'}
[+] c82 Persons involved code
COMMON_VALUES {'O', 'N', 'J', 'F', 'B', 'R', 'V', 'I', 'P', 'G', 'Q', 'D', 'M', 'C', 'L', 'A', 'E', 'K', 'H'}
[+] c84 Supporting factor code TECHNICAL
COMMON_VALUES {'O', 'N', 'W', 'U', 'J', 'F', 'B', 'R', 'V', 'I', 'T', 'P', 'S', 'G', 'Q', 'D', 'C', 'M', 'L', 'A', 'E', 'K', 'H'}
[+] c88 Second contributing case code f
COMMON_VALUES {'O', 'N', 'W', 'U', 'J', 'F', 'B', 'R', 'V', 'I', 'T', 'P', 'S', 'G', 'Q', 'D', 'C', 'M', 'L', 'A', 'E', 'K', 'H'}
[+] c90 Second persons involved code
COMMON_VALUES {'O', 'N', 'J', 'F', 'B', 'R', 'V', 'I', 'P', 'S', 'G', 'Q', 'D', 'M', 'C', 'L', 'A', 'E', 'K', 'H'}
[+] c92 Second supporting factor code . TECHNICAL
COMMON_VALUES {'O', 'N', 'W', 'U', 'J', 'F', 'B', 'R', 'V', 'I', 'T', 'P', 'S', 'G', 'Q', 'D', 'C', 'M', 'L', 'A', 'E', 'K', 'H'}
[+] c98 Damage code of aircraft.
COMMON_VALUES {'D', 'M', 'S', 'N', 'U'}
[+] c100 General cause category code
COMMON_VALUES {'A'}
[+] c123 Emergency Locator Transmitter on Board? Y or N
COMMON_VALUES {'P', 'U', 'N'}
[+] c184 Supporting cause factor A code. OPERATIONAL
COMMON_VALUES {'O', 'N', 'W', 'U', 'J', 'F', 'B', 'R', 'V', 'I', 'T', 'P', 'S', 'G', 'Q', 'D', 'C', 'M', 'L', 'A', 'E', 'K', 'H'}
[+] c192 Supporting cause factor B code. OPERATIONAL
COMMON_VALUES {'O', 'N', 'W', 'U', 'J', 'F', 'B', 'R', 'V', 'I', 'T', 'P', 'S', 'G', 'Q', 'D', 'C', 'M', 'L', 'A', 'E', 'K', 'H'}
Se desestima los cambios para esta columna ya que sus datos categóricos, son compartidos con varias columnas en el dataset como propios de estas.
# Categorical column whose stray values we want to trace
cat_column = 'c110'
# Columns previously found to share values with it
conflict_col = match_cols[cat_column]
moved_values(cat_column, conflict_col)
CAT_COLUMN c110 Light condition code
VALUES ['1', '2', '3', '4', '5']
[+] c10 Local time of the accident/incident.
COMMON_VALUES {'2', '4', '3', '5', '1'}
[+] c145 Code for Weight Class
COMMON_VALUES {'2', '1'}
[+] c153 Under or over 750 hp.
COMMON_VALUES {'4', '3'}
[+] c155 Type of the engine code.
COMMON_VALUES {'1'}
[+] c24 Model of the aircraft
COMMON_VALUES {'2'}
[+] c25 Group of the aircraft
COMMON_VALUES {'2', '4', '1'}
[+] c102 Primary type of flying code
COMMON_VALUES {'2', '4', '3', '5', '1'}
[+] c104 Secondary type of flying code
COMMON_VALUES {'2', '1', '5', '3'}
[+] c113 Cloud ceiling
COMMON_VALUES {'5', '3'}
[+] c114 Visibility code
COMMON_VALUES {'2', '4', '3', '5', '1'}
[+] c242 Gust indicator flag
COMMON_VALUES {'2', '4', '3', '5', '1'}
[+] c41 Certificate type code of the pilot in command
COMMON_VALUES {'1', '3'}
[+] c126 Certificate number of the air agency
COMMON_VALUES {'5', '1'}
[+] c80 Contributing factor code
COMMON_VALUES {'2', '4', '3', '5', '1'}
[+] c88 Second contributing case code f
COMMON_VALUES {'2', '4', '3', '5', '1'}
[+] c100 General cause category code
COMMON_VALUES {'2', '4', '3', '5', '1'}
[+] c244 Flight plan code
COMMON_VALUES {'2', '4', '3', '5', '1'}
Se desestima los cambios para esta columna ya que sus datos categóricos, son compartidos con varias columnas en el dataset como propios de estas.
# Categorical column whose stray values we want to trace
cat_column = 'c112'
# Columns previously found to share values with it
conflict_col = match_cols[cat_column]
moved_values(cat_column, conflict_col)
CAT_COLUMN c112 Sky condition code
VALUES ['BRKN', 'CLER', 'OBSC', 'OCST', 'SCAT']
[+] c113 Cloud ceiling
COMMON_VALUES {'CLER', 'OBSC', 'BRKN', 'SCAT', 'OCST'}
Todas las coincidencias de la columna c113 son originarias de la columna c112 y de ninguna otra, por lo tanto, todos estos valores serán desplazados a su lugar de origen.
# Categorical column whose stray values we want to trace
cat_column = 'c114'
# Columns previously found to share values with it
conflict_col = match_cols[cat_column]
moved_values(cat_column, conflict_col)
CAT_COLUMN c114 Visibility code
VALUES ['G10', 'LS1', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
[+] c10 Local time of the accident/incident.
COMMON_VALUES {'10', '9', '2', '4', '3', '5', '1', '7', '6'}
[+] c145 Code for Weight Class
COMMON_VALUES {'2', '1'}
[+] c149 Powered, Nonpowered, Optional
COMMON_VALUES {'7', '8'}
[+] c153 Under or over 750 hp.
COMMON_VALUES {'4', '3'}
[+] c155 Type of the engine code.
COMMON_VALUES {'1'}
[+] c24 Model of the aircraft
COMMON_VALUES {'2'}
[+] c25 Group of the aircraft
COMMON_VALUES {'10', '2', '4', '1', '7', '8'}
[+] c102 Primary type of flying code
COMMON_VALUES {'9', '2', '4', '3', '5', '1', '7', '8', '6'}
[+] c104 Secondary type of flying code
COMMON_VALUES {'2', '1', '5', '3'}
[+] c113 Cloud ceiling
COMMON_VALUES {'10', '5', '3', '7'}
[+] c240 Wind direction
COMMON_VALUES {'10'}
[+] c241 Wind speed in miles per hours
COMMON_VALUES {'10'}
[+] c242 Gust indicator flag
COMMON_VALUES {'2', '4', '3', '5', '1', '7', '8', '6'}
[+] c243 Gust speed in miles per hour
COMMON_VALUES {'10'}
[+] c41 Certificate type code of the pilot in command
COMMON_VALUES {'1', '3', '9'}
[+] c126 Certificate number of the air agency
COMMON_VALUES {'10', '1', '5'}
[+] c80 Contributing factor code
COMMON_VALUES {'2', '4', '3', '5', '1', '7', '8', '6'}
[+] c88 Second contributing case code f
COMMON_VALUES {'2', '4', '3', '5', '1', '7', '8', '6'}
[+] c100 General cause category code
COMMON_VALUES {'9', '2', '4', '3', '5', '1', '7', '8', '6'}
[+] c244 Flight plan code
COMMON_VALUES {'9', '2', '4', '3', '5', '1', '7', '8', '6'}
Todas las coincidencias para este atributo se trata de valores numericos, por lo tanto, se desestima moverlos ya que se trata de valores ambiguos existentes en muchas otras columnas.
Sobre la columna c115 no se realiza ninguna comprobación ya que sus valores no existen en ninguna otra columna.
Como conclusión tras analizar los valores desplazados:
La reordenación de valores se realizará si, para la columna objetivo, existe un valor NaN y el de la misma fila de la columna erronea tiene el dato que debiera estar en la columna original.
Para poder realizar esta comprobación, previamente se convertirá a NaN todos aquellos valores de las columnas categóricas preseleccionadas que sean inválidos ya que no se necesita un pretratado previo con los valores de las columnas objetivo al ser óptimos en gran medida y no encontrar ningún valor atípico.
# Identify values that do not belong to the column and convert them to NaN
def validate_values(col: pd.Series, cat_values: list):
    """Return `col` with every value outside `cat_values` replaced by NaN.

    Values that are in `cat_values` but stored with a different Python type
    (e.g. a float where the valid values are ints) are coerced to the type
    of the valid values.

    Parameters
    ----------
    col : pd.Series
        Series to clean.
    cat_values : list
        Values considered valid for this column.
    """
    if not cat_values:
        # No valid values at all: every element fails the membership test,
        # so the whole column becomes NaN.
        return col.apply(lambda _: np.nan)
    # Hoisted out of the per-element function: the target type never changes,
    # so there is no need to recompute it for every value.
    cat_type = type(cat_values[0])

    def clean_data(value):
        if value not in cat_values:
            return np.nan
        # Coerce type mismatches (only for the basic scalar types).
        if not isinstance(value, cat_type) and cat_type in (str, int, float):
            return cat_type(value)
        return value

    return col.apply(clean_data)
def foreign_values(c_origin, searched_values, c_foreign, dataframe: pd.DataFrame, source_df=None):
    """Move values that landed in a foreign column back to their origin column.

    Scans `source_df`; whenever `c_foreign` holds one of `searched_values`,
    the value is copied into `c_origin` of `dataframe` — but only if the
    origin cell is NaN, so existing data is never overwritten. Matches whose
    origin cell is already filled are reported instead of moved.

    Parameters
    ----------
    c_origin : column that should own the searched values.
    searched_values : values known to belong to `c_origin`.
    c_foreign : column where the values were found misplaced.
    dataframe : frame that receives the relocated values.
    source_df : frame to scan; defaults to the module-level `data_df`.
        Resolved at call time — the original default bound `data_df` at
        definition time, freezing whatever the variable pointed to then.
    """
    if source_df is None:
        source_df = data_df
    moved = 0
    match = 0
    origin_values = []
    for index, values in source_df.iterrows():
        if values[c_foreign] in searched_values:
            match += 1
            # Bring the value home only when the origin cell is empty.
            if pd.isna(values[c_origin]):
                dataframe.loc[index, c_origin] = values[c_foreign]
                moved += 1
            else:
                origin_values.append(values[c_foreign])
    print(f'\tMATCH\t{match}\tMOVED\t{moved}')
    if origin_values:
        print(f'\t[!] VALUES_IN_ORIGIN\t{origin_values}')
# Origin column, its legal values, and the columns holding its strays
column = 'c41'
column_foreign = ['c13', 'c59']
col_values = c41
searched_values = ['TT']
# Null out values that do not belong to the origin column
df[column] = validate_values(col=df[column], cat_values=col_values)
# Pull the matching strays from each foreign column back into the origin;
# only the values confirmed by the previous overlap search are moved.
for c_for in column_foreign:
    print(f'[+] COLUMN\t{c_for}')
    foreign_values(column, searched_values, c_for, df)
[+] COLUMN c13 MATCH 6 MOVED 0 [!] VALUES_IN_ORIGIN ['TT', 'TT', 'TT', 'TT', 'TT', 'TT'] [+] COLUMN c59 MATCH 1 MOVED 0 [!] VALUES_IN_ORIGIN ['TT']
# Origin column, its legal values, and the columns holding its strays
column = 'c96'
column_foreign = ['c94']
col_values = c96
searched_values = ['CX', 'KA', 'JX', 'IH', 'KC']
# Null out values that do not belong to the origin column
df[column] = validate_values(col=df[column], cat_values=col_values)
# Pull the matching strays from each foreign column back into the origin;
# only the values confirmed by the previous overlap search are moved.
for c_for in column_foreign:
    print(f'[+] COLUMN\t{c_for}')
    foreign_values(column, searched_values, c_for, df)
[+] COLUMN c94 MATCH 7 MOVED 0 [!] VALUES_IN_ORIGIN ['KC', 'KC', 'CX', 'CX', 'JX', 'KA', 'IH']
# Origin column, its legal values, and the columns holding its strays
column = 'c112'
column_foreign = ['c113']
col_values = c112
searched_values = ['OBSC', 'CLER', 'SCAT', 'BRKN', 'OCST']
# Null out values that do not belong to the origin column
df[column] = validate_values(col=df[column], cat_values=col_values)
# Pull the matching strays from each foreign column back into the origin;
# only the values confirmed by the previous overlap search are moved.
for c_for in column_foreign:
    print(f'[+] COLUMN\t{c_for}')
    foreign_values(column, searched_values, c_for, df)
[+] COLUMN c113 MATCH 57 MOVED 45 [!] VALUES_IN_ORIGIN ['CLER', 'CLER', 'CLER', 'SCAT', 'CLER', 'OCST', 'CLER', 'CLER', 'CLER', 'OBSC', 'CLER', 'CLER']
La única columna en la que se ha realizado movimiento de datos ha sido la c112, ya que para el resto de columnas los valores coincidían con los buscados, del mismo modo que también sucede para algunos registros en la columna c112.
Ahora que ya tenemos todas las columnas con los datos sin desplazar, es hora de ver aquellos valores atípicos o faltantes en cada atributo.
Para poder realizar dicho estudio, se hará uso de la descripción de los valores realizada previamente y mediante una comparación de sus datos esperados, se imputará un valor nulo NaN para poder tratarlos más adelante.
Ya que existe atributos categóricos y numéricos, se realiza dos tipos de comprobaciones:
NOTA
Los datos aquí indicados como categóricos o numéricos, no son necesariamente de dicha índole para el estudio del modelo, esta agrupación se ha realizado por tipo de datos contenido pero no por significado para el modelo final.
# Column under inspection and its expected (valid) values
col = 'c1'
cat_values = c1
# Values actually present, and those outside the expected set
col_values = df[col].unique()
bad_values = set(col_values).difference(cat_values)
print(f"[+] {col}\t{col_name(col)}")
print(f"EXP_VALUES\t{cat_values}")
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c1 Type of Event
EXP_VALUES ['A', 'I']
COL_VALUES ['A' 'I' nan 'AERO SAFETY FLIGHT SCHOOL' 'UNIVERSAL FLIGHT TRAINING'
'DULLES AVIATION']
BAD_VALUES {nan, 'UNIVERSAL FLIGHT TRAINING', 'DULLES AVIATION', 'AERO SAFETY FLIGHT SCHOOL'}
Existen valores que no son de la columna en cuestión; sin más análisis que realizar, se eliminan dichos valores y se convierten a NaN.
# Replace out-of-domain values with NaN, then show the cleaned unique set
df[col] = validate_values(df[col], cat_values)
df[col].unique()
array(['A', 'I', nan], dtype=object)
# Column under inspection and its expected (valid) values
col = 'c11'
cat_values = c11
# Values actually present, and those outside the expected set
col_values = df[col].unique()
bad_values = set(col_values).difference(cat_values)
print(f"[+] {col}\t{col_name(col)}")
print(f"EXP_VALUES\t{cat_values}")
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c11 Region of the accident/incident location.
EXP_VALUES ['AC', 'AG', 'AL', 'CE', 'EA', 'EU', 'FS', 'GL', 'NE', 'NM', 'SD', 'SO', 'SW', 'WA', 'WP']
COL_VALUES ['CE' 'GL' 'SO' 'NW' 'SW' 'WE' 'RM' 'PC' 'NE' 'EA' 'AL' 'EU' 'WP' 'NM' nan
'AS' 'WA' '99' 'SD' 'FS']
BAD_VALUES {nan, '99', 'AS', 'PC', 'WE', 'RM', 'NW'}
Existen valores que no son de la columna en cuestión; sin más análisis que realizar, se eliminan dichos valores y se convierten a NaN.
# Replace out-of-domain values with NaN, then show the cleaned unique set
df[col] = validate_values(df[col], cat_values)
df[col].unique()
array(['CE', 'GL', 'SO', nan, 'SW', 'NE', 'EA', 'AL', 'EU', 'WP', 'NM',
'WA', 'SD', 'FS'], dtype=object)
# Column under inspection and its expected (valid) values
col = 'c12'
cat_values = c12
# Values actually present, and those outside the expected set
col_values = df[col].unique()
bad_values = set(col_values).difference(cat_values)
print(f"[+] {col}\t{col_name(col)}")
print(f"EXP_VALUES\t{cat_values}")
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c12 District office of the accident/incident location.
EXP_VALUES ['ACRC', 'AC70', 'AC74', 'AC75', 'AC78', 'AC90', 'AGC1', 'ALRC', 'AL00', 'AL01', 'AL03', 'AL05', 'AL70', 'AL72', 'AL78', 'AL79', 'AL80', 'AL90', 'AL91', 'AUS', 'CA', 'CERC', 'CE00', 'CE01', 'CE03', 'CE05', 'CE07', 'CE09', 'CE40', 'CE42', 'CE43', 'CE44', 'CE45', 'CE46', 'CE47', 'CE48', 'CE49', 'CE50', 'CE51', 'CE52', 'CE53', 'CE54', 'CE55', 'CE56', 'CE57', 'CE58', 'CE59', 'CE70', 'CE71', 'CE72', 'CE73', 'CE78', 'CE79', 'CE80', 'CE90', 'CE91', 'EARC', 'EA00', 'EA01', 'EA03', 'EA05', 'EA07', 'EA09', 'EA11', 'EA13', 'EA15', 'EA16', 'EA17', 'EA19', 'EA21', 'EA23', 'EA25', 'EA27', 'EA29', 'EA31', 'EA33', 'EA35', 'EA70', 'EA71', 'EA72', 'EA73', 'EA74', 'EA78', 'EA79', 'EA80', 'EA90', 'EA91', 'EU00', 'EU01', 'EU02', 'EU03', 'EU70', 'EU72', 'EU73', 'EU74', 'EU76', 'FS01', 'GLRC', 'GL00', 'GL01', 'GL03', 'GL05', 'GL07', 'GL09', 'GL10', 'GL11', 'GL13', 'GL15', 'GL17', 'GL19', 'GL21', 'GL23', 'GL25', 'GL27', 'GL31', 'GL70', 'GL71', 'GL72', 'GL73', 'GL74', 'GL75', 'GL76', 'GL78', 'GL79', 'GL80', 'GL90', 'GL91', 'NERC', 'NE00', 'NE01', 'NE02', 'NE03', 'NE05', 'NE40', 'NE41', 'NE42', 'NE43', 'NE44', 'NE45', 'NE46', 'NE47', 'NE48', 'NE49', 'NE56', 'NE57', 'NE58', 'NE70', 'NE71', 'NE78', 'NE79', 'NE80', 'NE90', 'NE91', 'NMRC', 'NM00', 'NM01', 'NM02', 'NM03', 'NM04', 'NM05', 'NM06', 'NM07', 'NM08', 'NM09', 'NM10', 'NM11', 'NM13', 'NM40', 'NM41', 'NM42', 'NM43', 'NM46', 'NM47', 'NM48', 'NM50', 'NM51', 'NM56', 'NM57', 'NM58', 'NM70', 'NM71', 'NM72', 'NM73', 'NM78', 'NM79', 'NM80', 'NM81', 'NM82', 'NM83', 'NM90', 'NM91', 'SD00', 'SORC', 'SO00', 'SO01', 'SO03', 'SO05', 'SO06', 'SO07', 'SO08', 'SO09', 'SO11', 'SO13', 'SO14', 'SO15', 'SO16', 'SO17', 'SO19', 'SO21', 'SO23', 'SO25', 'SO27', 'SO31', 'SO33', 'SO64', 'SO70', 'SO71', 'SO72', 'SO73', 'SO74', 'SO75', 'SO76', 'SO77', 'SO78', 'SO79', 'SO80', 'SO90', 'SO91', 'SO92', 'SO93', 'SO94', 'SO95', 'SWRC', 'SW00', 'SW01', 'SW02', 'SW03', 'SW05', 'SW07', 'SW09', 'SW11', 'SW13', 'SW15', 'SW17', 'SW19', 'SW21', 'SW40', 'SW41', 
'SW42', 'SW43', 'SW56', 'SW57', 'SW70', 'SW71', 'SW72', 'SW73', 'SW78', 'SW79', 'SW80', 'SW84', 'SW90', 'SW91', 'SW99', 'WA45', 'WA70', 'WA71', 'WA72', 'WA73', 'WA91', 'WPRC', 'WP00', 'WP01', 'WP02', 'WP03', 'WP05', 'WP06', 'WP07', 'WP09', 'WP11', 'WP13', 'WP15', 'WP17', 'WP19', 'WP21', 'WP23', 'WP25', 'WP27', 'WP28', 'WP29', 'WP30', 'WP31', 'WP33', 'WP70', 'WP71', 'WP72', 'WP73', 'WP74', 'WP75', 'WP76', 'WP77', 'WP78', 'WP79', 'WP80', 'WP81', 'WP82', 'WP83', 'WP90', 'WP91']
COL_VALUES ['CE11' 'GL19' 'SO62' 'GL18' 'NW02' 'SW09' 'NW61' 'GL07' 'SO05' 'GL05'
'GL13' 'GL20' 'WE62' 'NW03' 'WE11' 'GL03' 'SO09' 'SW07' 'CE04' 'RM03'
'GL14' 'PC51' 'NE13' 'SW06' 'CE22' 'RM67' 'EA16' 'SO07' 'SW08' 'NE15'
'SO11' 'RM02' 'GL06' 'SO01' 'CE12' 'EA21' 'AL01' 'SO08' 'CE62' 'SW11'
'WE03' 'GL08' 'SO61' 'WE12' 'WE02' 'SO13' 'WE66' 'WE06' 'NW05' 'NE19'
'WE08' 'SW10' 'WE01' 'WE64' 'EA22' 'SW03' 'AL62' 'RM06' 'GL10' 'SO02'
'SO03' 'SW02' 'EA14' 'AL61' 'SW04' 'RM05' 'SO63' 'SW01' 'EA03' 'NW01'
'EA11' 'EA10' 'RM04' 'EA09' 'SW12' 'EA62' 'WE04' 'EA18' 'SW05' 'EA17'
'UNKN' 'PC61' 'RM01' 'EA01' 'WE65' 'SO04' 'WP04' 'NM61' 'WP01' 'WP64'
'NM02' 'NM01' 'WP09' 'WP08' 'NM66' 'NM03' 'NM67' 'WP03' 'GL61' 'WP11'
'WP66' 'WP02' 'WP61' 'WP12' 'WP65' 'WP06' 'WP67' 'CE64' 'EA61' 'SW61'
'NM64' 'NM65' 'SO64' 'CE65' 'SW62' 'NM62' 'GL64' 'NM63' 'CE61' 'SW64'
'SW65' nan 'GL66' 'GL63' 'WP33' 'NM07' 'SO65' 'WP05' 'SO66' 'GL65' 'WP14'
'SO67' 'SW63' 'SW68' 'EU01' 'NE61' 'WP13' 'SW67' 'NE63' 'WP07' 'SW66'
'NE65' 'GL31' 'EA00' 'WP10' 'EU00' 'EA63' 'SW00' 'NM05' 'AL31' 'WP31'
'SO00' 'NE03' 'SO19' 'CE07' 'GL27' 'SO15' 'WP27' 'SW17' 'WP25' 'WP17'
'SO53' 'EA13' 'GL09' 'NE05' 'NE01' 'EA38' 'WP15' 'GL15' 'GL11' 'GL23'
'SW33' 'GL34' 'AL63' 'ASF1' 'SW41' 'CE33' 'AL79' 'EA31' 'NM60' 'CE63'
'EA07' 'CE05' 'EA25' 'AL03' 'EA27' 'AL05' 'GL25' 'EA23' 'EA05' 'CE03'
'WP23' 'SW15' 'SW13' 'CE09' 'GL17' 'WP21' 'NM09' 'CE01' 'SO21' 'WP19'
'NM13' 'GL21' 'SO16' 'EA15' 'SO06' 'EA19' 'SO17' 'NM04' 'WP00' 'NM00'
'EA29' 'NM08' 'SO14' 'WP28' 'NE02' 'GL01' 'CE45' 'EU51' 'SW19' 'CE71'
'NM06' 'NM10' 'SO80' 'WA45' 'NM11' 'GL00' 'SW21' 'EU03' 'WP29' 'SO25'
'EA35' 'SO33' 'SO27' 'SO23' 'AL00' 'EU70' 'SO35' 'EA33' 'SW23' 'CE02'
'SW27' 'CE13' 'SW29' 'EA68' 'SO29' 'EA65' 'SO37' 'SW25' 'CE17' 'SW31'
'EA39' 'CE19' 'CE21' 'NM21' 'NM19' 'WP30' 'SO39' 'WP35' 'CE25' 'AL07'
'WP37' 'GL33' 'CE23' 'AL09' 'WP39' 'SW39' 'FS58' 'FS57' 'FS59' 'GL62'
'FS56']
BAD_VALUES {nan, 'NE13', 'FS57', 'EA38', 'SO37', 'GL20', 'WE03', 'SW12', 'SW29', 'NM65', 'AL63', 'NW01', 'WE12', 'WP65', 'RM06', 'NM19', 'CE04', 'WP12', 'PC61', 'NW03', 'SW27', 'GL64', 'WP04', 'CE17', 'CE11', 'CE22', 'CE64', 'SW33', 'NE19', 'EA18', 'AL62', 'SO29', 'WP37', 'GL18', 'SW08', 'CE62', 'NM67', 'SW63', 'SW62', 'WE62', 'PC51', 'CE33', 'GL06', 'WE65', 'SO65', 'SO02', 'CE12', 'SW61', 'CE02', 'NM61', 'GL14', 'CE63', 'FS56', 'SO53', 'SW10', 'SO63', 'NM66', 'GL62', 'WP08', 'WP66', 'WE01', 'SW23', 'SW64', 'AL61', 'SW06', 'WP35', 'SW67', 'NE65', 'SW68', 'EA22', 'SW25', 'ASF1', 'RM05', 'CE23', 'WE66', 'EU51', 'WP10', 'UNKN', 'NE61', 'WP39', 'GL63', 'WP61', 'GL65', 'AL31', 'SW04', 'AL07', 'SO61', 'CE25', 'AL09', 'RM04', 'RM01', 'CE13', 'FS59', 'SO04', 'WE06', 'EA10', 'SO62', 'SW66', 'RM03', 'WP64', 'NW61', 'SO35', 'NM63', 'GL66', 'FS58', 'RM02', 'EA65', 'EA14', 'SO66', 'CE19', 'NM62', 'GL33', 'WE04', 'NE15', 'WE08', 'CE21', 'NM60', 'NM64', 'NW02', 'EA61', 'WP14', 'WE11', 'GL08', 'WE02', 'NW05', 'CE65', 'EA68', 'EA62', 'NM21', 'EA63', 'SW39', 'CE61', 'NE63', 'SW31', 'SW65', 'EA39', 'RM67', 'WP67', 'GL34', 'SO67', 'WE64', 'SO39', 'GL61'}
Existen valores que no son de la columna; no obstante, dado el formato de estos, se entiende que son de la misma característica que el resto de valores, pero que no se han incluido en el documento de referencia para los datos esperados en cada columna. Por otro lado, existe la clase "UNKN", que se entiende que hace referencia a un dato desconocido.
Dicho esto, se mantienen los datos en la columna y se convierten a NaN aquellos con valor "UNKN".
# Map the explicit "unknown" code to NaN.  Assignment form instead of
# `inplace=True` on a column selection: chained inplace is deprecated and
# silently ineffective under pandas Copy-on-Write.
df[col] = df[col].replace('UNKN', np.nan)
# Column under inspection; `c13` is the expected-values list loaded
# earlier in the notebook (not visible in this chunk).
col = 'c13'
cat_values = c13
col_values = df[col].unique()
# Values present in the data but missing from the reference list
bad_values = set(col_values) - set(cat_values)
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f'EXP_VALUES\t{cat_values}')
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c13 State of the accident/incident location.
EXP_VALUES ['AA', 'AB', 'AC', 'AF', 'AG', 'AI', 'AJ', 'AK', 'AL', 'AM', 'AN', 'AO', 'AQ', 'AR', 'AS', 'AT', 'AU', 'AV', 'AY', 'AZ', 'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG', 'BH', 'BK', 'BL', 'BM', 'BN', 'BO', 'BP', 'BQ', 'BR', 'BS', 'BT', 'BU', 'BV', 'BX', 'BY', 'CA', 'CB', 'CC', 'CD', 'CE', 'CF', 'CG', 'CH', 'CI', 'CJ', 'CK', 'CL', 'CM', 'CN', 'CO', 'CP', 'CQ', 'CR', 'CS', 'CT', 'CU', 'CV', 'CW', 'CY', 'CZ', 'DA', 'DC', 'DE', 'DJ', 'DM', 'DO', 'DQ', 'DR', 'EC', 'EG', 'EI', 'EK', 'EN', 'EQ', 'ER', 'ES', 'ET', 'EU', 'EZ', 'FA', 'FG', 'FI', 'FJ', 'FK', 'FL', 'FM', 'FO', 'FP', 'FQ', 'FR', 'FS', 'FT', 'GA', 'GB', 'GC', 'GE', 'GG', 'GH', 'GI', 'GJ', 'GK', 'GL', 'GM', 'GN', 'GO', 'GP', 'GQ', 'GR', 'GT', 'GV', 'GY', 'GZ', 'HA', 'HI', 'HK', 'HM', 'HO', 'HQ', 'HR', 'HU', 'IA', 'IC', 'ID', 'IE', 'II', 'IL', 'IM', 'IN', 'IO', 'IP', 'IR', 'IS', 'IT', 'IV', 'IZ', 'JA', 'JE', 'JM', 'JN', 'JO', 'JQ', 'JU', 'KE', 'KG', 'KN', 'KO', 'KQ', 'KR', 'KS', 'KT', 'KU', 'KY', 'KZ', 'LA', 'LE', 'LG', 'LH', 'LI', 'LO', 'LQ', 'LS', 'LT', 'LU', 'LY', 'L0', 'MA', 'MB', 'MC', 'MD', 'ME', 'MF', 'MG', 'MH', 'MI', 'MJ', 'MK', 'ML', 'MM', 'MN', 'MO', 'MP', 'MQ', 'MR', 'MS', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'M1', 'M2', 'M3', 'M4', 'NA', 'NB', 'NC', 'ND', 'NE', 'NF', 'NG', 'NH', 'NI', 'NJ', 'NL', 'NM', 'NO', 'NP', 'NR', 'NS', 'NT', 'NU', 'NV', 'NW', 'NX', 'NY', 'NZ', 'OH', 'OK', 'OR', 'PA', 'PC', 'PE', 'PF', 'PG', 'PK', 'PL', 'PN', 'PO', 'PP', 'PQ', 'PR', 'PS', 'PT', 'PU', 'QA', 'RE', 'RH', 'RI', 'RM', 'RO', 'RP', 'RQ', 'RS', 'RW', 'SA', 'SB', 'SC', 'SD', 'SE', 'SF', 'SG', 'SH', 'SI', 'SK', 'SL', 'SM', 'SN', 'SO', 'SP', 'SR', 'SS', 'ST', 'SU', 'SV', 'SW', 'SX', 'SY', 'SZ', 'S1', 'S2', 'TC', 'TD', 'TE', 'TG', 'TH', 'TI', 'TK', 'TL', 'TM', 'TN', 'TO', 'TP', 'TS', 'TU', 'TV', 'TW', 'TX', 'TZ', 'UG', 'UK', 'UP', 'UR', 'US', 'UT', 'UV', 'UY', 'UZ', 'VA', 'VC', 'VE', 'VI', 'VM', 'VQ', 'VT', 'WA', 'WF', 'WI', 'WN', 'WQ', 'WS', 'WV', 'WY', 'WZ', 'XX', 'YE', 'YM', 'YO', 'YS', 'ZA', 'ZI']
COL_VALUES ['MO' 'IL' 'TN' 'IN' 'OR' 'OK' 'WA' 'OH' 'FL' 'WI' 'MI' 'AZ' 'CA' 'SC'
'TX' 'IA' 'CO' 'MN' 'XX' 'MA' 'AR' 'KS' 'UT' 'VA' 'LA' 'NH' 'NC' 'WY'
'GA' 'NE' 'MD' 'AK' 'KY' 'ID' 'CT' 'NM' 'NV' 'SD' 'AL' 'PA' 'MT' 'NY'
'VT' 'ND' 'DE' 'DC' 'NJ' 'RQ' 'HI' 'ME' 'MS' 'RI' 'WV' 'VQ' 'CN' nan 'GQ'
'BF' 'IR' 'JA' 'DR' 'RP' 'PN' 'CM' 'AQ' 'UK' 'MX' 'TT' 'GE' 'CH' 'EI'
'HA' 'VE' 'IT' 'MY' 'AV' 'GC' 'TC' 'CU' 'JM' 'BL' 'PO' 'NU' 'VM' 'TQ'
'BE' 'NL' 'SW' 'KO' 'BR' 'NR' 'IC' 'FR' 'SA' 'CS' 'TK' 'DO' 'GT' 'YE'
'PR' 'WN' 'PQ' 'II' 'GL' 'MB' 'KE' 'AO' 'SF' 'VI' 'BH' 'YO' 'BD' 'MQ'
'PE' 'BA' 'CQ' 'HK' 'NZ' 'VC' 'CP' 'AC' 'GY' 'HO' 'SK' 'FT' 'PG' 'AS'
'NT' 'TO' 'GR' 'KU' 'PS' 'KQ' 'IE' 'AY' 'TD' 'FM' 'WQ' 'SP' 'PC' 'NG'
'BC' 'AJ' 'EC' 'AG' 'SG' 'BP' 'MP' 'PL' 'EU' 'NX' 'ES' 'SN' 'SX' 'TH'
'LI' 'FJ' 'ST' 'AA' 'IS' 'DA' 'US' 'QA' 'RM' 'TG' 'ZZ' 'OC' 'ON' 'GU'
'AB' 'NS']
BAD_VALUES {nan, 'ON', 'GU', 'ZZ', 'OC', 'TT', 'TQ'}
En la columna existen valores que no pertenecen a ella pero que tienen el mismo formato.
Dicho esto, se mantienen los valores, ya que podría tratarse de valores no definidos en la información de la columna.
# c14 (city) has no finite catalog, so only its raw values are listed.
col = 'c14'
col_values = df[col].unique()
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f"COL_VALUES\t{col_values}")
[+] c14 City of the accident/incident location. COL_VALUES ['RICHMOND' 'OLNEY' 'MONTEREY' ... 'DREXEL HILL' 'RAINBOW' 'ADAIR']
La columna contiene muchos elementos diferentes, por lo que quiere decir que no existe una lista de ciudades finita. Es por eso, que se comprueba que valores de la lista no son una cadena de texto, es decir, que contenga caracteres que no son letras.
# Check whether the whole column contains only alphabetic strings
res = df['c14'].str.isalpha().all()
print('[+] ONLY_ALPHA\t', res)
# Check which values are neither strings nor NaN
if not res:
    # Flag with True/False the values that are not strings and not NaN.
    # isinstance() is the idiomatic type check (handles str subclasses),
    # unlike the original `type(x) != str` comparison.
    bad_values = df[col].apply(lambda x: not isinstance(x, str) and not pd.isna(x))
    # Extract the offending values from the column
    bad_values = df.loc[bad_values, col]
    print(f'BAD_VALUES\t{bad_values}')
[+] ONLY_ALPHA False BAD_VALUES Series([], Name: c14, dtype: object)
Se comprueba que los valores que no son string, en realidad son valores NaN, por lo que, no es necesario realizar ninguna operación.
# Column under inspection; `c30` is the expected-values list loaded
# earlier in the notebook (not visible in this chunk).
col = 'c30'
cat_values = c30
# Unique values present in the column
col_values = df[col].unique()
# Values in the column that are not among the expected ones
bad_values = set(col_values) - set(cat_values)
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f'EXP_VALUES\t{cat_values}')
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c30 Airworthiness class code of the aircraft
EXP_VALUES ['STRD', 'LIMT', 'REST', 'EXPT', 'PROV', 'MULT', 'FERY']
COL_VALUES ['STRD' nan 'REST' 'EXPT' 'MULT' 'LIMT' 'PROV']
BAD_VALUES {nan}
No existen valores foráneos a la columna; por lo tanto, no se necesita limpieza de los datos.
# Column under inspection; `c41` is the expected-values list loaded
# earlier in the notebook (not visible in this chunk).
col = 'c41'
cat_values = c41
# Unique values present in the column
col_values = df[col].unique()
# Values in the column that are not among the expected ones
bad_values = set(col_values) - set(cat_values)
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f'EXP_VALUES\t{cat_values}')
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c41 Certificate type code of the pilot in command
EXP_VALUES ['F1', 'F3', 'F9', 'TT', 'XX', '00', '01', '02', '03', '09']
COL_VALUES ['03' '01' '09' 'XX' nan 'TT' 'F3' 'F9' 'F1' '00' '02']
BAD_VALUES {nan}
No existen valores foráneos a la columna; por lo tanto, no se necesita limpieza de los datos.
# Column under inspection; `c49` is the expected-values list loaded
# earlier in the notebook (not visible in this chunk).
col = 'c49'
cat_values = c49
# Unique values present in the column
col_values = df[col].unique()
# Values in the column that are not among the expected ones
bad_values = set(col_values) - set(cat_values)
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f'EXP_VALUES\t{cat_values}')
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c49 Qualification code of the pilot in command .
EXP_VALUES ['E', 'F', 'G', 'H', 'M', 'N', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X']
COL_VALUES ['E' 'G' 'H' 'M' nan 'F' 'R' 'N' 'P' 'S' 'T' 'U' 'W' 'V' 'X']
BAD_VALUES {nan}
No existen valores foráneos a la columna; por lo tanto, no se necesita limpieza de los datos.
# Column under inspection; `c96` is the expected-values list loaded
# earlier in the notebook (not visible in this chunk).
col = 'c96'
cat_values = c96
# Unique values present in the column
col_values = df[col].unique()
# Values in the column that are not among the expected ones
bad_values = set(col_values) - set(cat_values)
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f'EXP_VALUES\t{cat_values}')
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c96 Phase of flight code
EXP_VALUES ['AA', 'AB', 'AC', 'AD', 'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'CX', 'DA', 'DB', 'DC', 'DD', 'DE', 'DF', 'DG', 'DH', 'EX', 'FA', 'FB', 'FC', 'FD', 'FE', 'FF', 'FG', 'GX', 'HA', 'HB', 'HC', 'HD', 'HE', 'HF', 'HG', 'HH', 'HK', 'HM', 'IA', 'IB', 'IC', 'ID', 'IE', 'IF', 'IG', 'IH', 'JX', 'KA', 'KB', 'KC', 'KD', 'KE', 'KF', 'KG', 'PJ', 'PL', 'SR']
COL_VALUES ['IB' 'DD' 'HB' 'IA' 'BD' 'BI' 'IH' 'DE' 'DA' 'BC' 'KE' 'IE' 'BG' 'BF'
'KC' 'HA' 'GX' 'IC' 'EX' 'FE' 'BE' 'JX' 'AD' 'KG' 'KD' 'BA' 'BH' 'AB'
'AA' 'HD' 'KB' 'KF' 'IF' 'AC' 'HC' 'KA' 'ID' 'FD' 'IG' 'FF' nan 'DF' 'FG'
'BB' 'HH' 'HF' 'DB' 'FA' 'FB' 'DG' 'HG' 'FC' 'HE' 'DH' 'PJ' 'HM' 'SR'
'HK' 'PL']
BAD_VALUES {nan}
No existen valores foráneos a la columna; por lo tanto, no se necesita limpieza de los datos.
# Column under inspection; `c106` is the expected-values list loaded
# earlier in the notebook (not visible in this chunk).
col = 'c106'
cat_values = c106
# Unique values present in the column
col_values = df[col].unique()
# Values in the column that are not among the expected ones
bad_values = set(col_values) - set(cat_values)
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f'EXP_VALUES\t{cat_values}')
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c106 Primary flying condition code
EXP_VALUES ['A', 'B', 'C']
COL_VALUES ['B' 'A' 'C' nan '19540228.0' '19620714.0' '19760912.0' '19700202.0'
'19680726.0' '19691204.0' '19641127.0' '19440916.0' '19560306.0'
'19550927.0' '19961002.0' '19670626.0' '19930126.0' '19571026.0'
'19691006.0']
BAD_VALUES {nan, '19540228.0', '19691204.0', '19930126.0', '19440916.0', '19961002.0', '19641127.0', '19571026.0', '19670626.0', '19691006.0', '19760912.0', '19620714.0', '19680726.0', '19560306.0', '19550927.0', '19700202.0'}
# Drop out-of-catalog values (date strings leaked into c106): anything
# not in cat_values becomes NaN via the notebook helper `validate_values`.
df[col] = validate_values(col=df[col], cat_values=cat_values)
df[col].unique()
array(['B', 'A', 'C', nan], dtype=object)
# Column under inspection; `c108` is the expected-values list loaded
# earlier in the notebook (not visible in this chunk).
col = 'c108'
cat_values = c108
# Unique values present in the column
col_values = df[col].unique()
# Values in the column that are not among the expected ones
bad_values = set(col_values) - set(cat_values)
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f'EXP_VALUES\t{cat_values}')
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c108 Secondary flying condition code
EXP_VALUES ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W']
COL_VALUES ['S' 'D' 'U' 'M' 'G' 'N' nan 'L' 'T' 'H' 'F' 'I' 'O' 'W' 'J' 'V' 'A' 'P'
'B' 'C' 'R' 'K' 'E' 'Q' '19890602.0']
BAD_VALUES {nan, '19890602.0'}
# Drop the stray date string: anything not in cat_values becomes NaN
# via the notebook helper `validate_values`.
df[col] = validate_values(col=df[col], cat_values=cat_values)
df[col].unique()
array(['S', 'D', 'U', 'M', 'G', 'N', nan, 'L', 'T', 'H', 'F', 'I', 'O',
'W', 'J', 'V', 'A', 'P', 'B', 'C', 'R', 'K', 'E', 'Q'],
dtype=object)
# Column under inspection; `c110` is the expected-values list loaded
# earlier in the notebook (not visible in this chunk).
col = 'c110'
cat_values = c110
# Unique values present in the column
col_values = df[col].unique()
# Values in the column that are not among the expected ones
bad_values = set(col_values) - set(cat_values)
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f'EXP_VALUES\t{cat_values}')
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c110 Light condition code
EXP_VALUES ['1', '2', '3', '4', '5']
COL_VALUES [ 4. 1. 3. 2. 5. nan]
BAD_VALUES {nan, 1.0, 2.0, 3.0, 4.0, 5.0}
Los valores del atributo coinciden, pero están en formato flotante, por lo que se convierten al formato string esperado.
# Map NaN to 0 -- a code outside the column's catalog -- so the whole
# column can be cast to integer and then to the expected string format
# in a single chain (the stray '0' is turned back into NaN right after).
df[col] = df[col].fillna(0).astype(int).astype(str)
df[col].unique()
array(['4', '1', '3', '2', '5', '0'], dtype=object)
# Turn the sentinel '0' (and any other out-of-catalog value) into NaN
# via the notebook helper `validate_values`.
df[col] = validate_values(col=df[col], cat_values=cat_values)
df[col].unique()
array(['4', '1', '3', '2', '5', nan], dtype=object)
# Column under inspection; `c112` is the expected-values list loaded
# earlier in the notebook (not visible in this chunk).
col = 'c112'
cat_values = c112
# Unique values present in the column
col_values = df[col].unique()
# Values in the column that are not among the expected ones
bad_values = set(col_values) - set(cat_values)
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f'EXP_VALUES\t{cat_values}')
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c112 Sky condition code
EXP_VALUES ['BRKN', 'CLER', 'OBSC', 'OCST', 'SCAT']
COL_VALUES [nan 'BRKN' 'SCAT' 'CLER' 'OBSC' 'OCST']
BAD_VALUES {nan}
No existen valores foráneos a la columna; por lo tanto, no se necesita limpieza de los datos.
# Column under inspection; `c114` is the expected-values list loaded
# earlier in the notebook (not visible in this chunk).
col = 'c114'
cat_values = c114
# Unique values present in the column
col_values = df[col].unique()
# Values in the column that are not among the expected ones
bad_values = set(col_values) - set(cat_values)
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f'EXP_VALUES\t{cat_values}')
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c114 Visibility code
EXP_VALUES ['G10', 'LS1', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
COL_VALUES [nan '5' 'G10' '10' 'LS1' '1' '3' '4' '6' '7' '2' '8' '9' 'GR3']
BAD_VALUES {nan, 'GR3'}
Se observa que existe un valor no coincidente con los del atributo, por lo que se eliminan sus coincidencias convirtiéndolas en NaN.
# Convert the out-of-catalog 'GR3' (and any other foreign value) to NaN
# via the notebook helper `validate_values`.
df[col] = validate_values(col=df[col], cat_values=cat_values)
df[col].unique()
array([nan, '5', 'G10', '10', 'LS1', '1', '3', '4', '6', '7', '2', '8',
'9'], dtype=object)
# Column under inspection; `c115` is the expected-values list loaded
# earlier in the notebook (not visible in this chunk).
col = 'c115'
cat_values = c115
# Unique values present in the column
col_values = df[col].unique()
# Values in the column that are not among the expected ones
bad_values = set(col_values) - set(cat_values)
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f'EXP_VALUES\t{cat_values}')
print(f"COL_VALUES\t{col_values}")
print(f"BAD_VALUES\t{bad_values}")
[+] c115 Visibility restriction code.
EXP_VALUES ['BLS', 'DRZ', 'DST', 'FOG', 'HAZ', 'RAI', 'SMK', 'SNO']
COL_VALUES [nan 'FOG' 'RAI' 'HAZ' 'SNO' 'DRZ' 'BLS' 'DST' 'SMK']
BAD_VALUES {nan}
No existen valores foráneos a la columna; por lo tanto, no se necesita limpieza de los datos.
col = 'c7'
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f"VALUES\t{df[col].unique()}\n")
[+] c7 Month the accident/incident happened. VALUES [1 2 3 4 5 6 7 10 9 8 11 12 <NA>]
Se observa que el formato del valor nulo es el siguiente:
# Inspect the concrete type of the missing value (index 12 in the
# uniques shown above) -- it is pandas' NAType, not np.nan.
type(df['c7'].unique()[12])
pandas._libs.missing.NAType
# Normalize the pandas missing-value singleton to np.nan so every column
# shares the same NaN representation.  Assignment form instead of chained
# `inplace=True` on a column selection (deprecated; a no-op under
# pandas Copy-on-Write).
df[col] = df[col].replace(pd.NaT, np.nan)
print(f"VALUES\t{df[col].unique()}\n")
VALUES [ 1. 2. 3. 4. 5. 6. 7. 10. 9. 8. 11. 12. nan]
Todos los valores de la columna son los esperados, no se añade nuevos NaN, aunque estos, si que se convierten al tipo de datos float de numpy para mantener el mismo tipo de datos NaN en todas las columnas.
col = 'c10'
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f"VALUES\t{df[col].unique()}\n")
[+] c10 Local time of the accident/incident. VALUES [nan '0000' '1515' ... '4105' '0129' '0239']
La columna c10 contiene los datos de la hora del incidente, pero para poder lidiar con todos los datos, se convierte los valores dados en formato HHMM en HH. Para ello, se recorre todos los valores y se modifica su valor, además, si los minutos son más de 30, se sumará una hora al resultado.
Se toma esta decisión para no sobrepasar al modelo de características para este atributo.
def get_hour(value):
    """
    Convert a time given as an 'HHMM' string to a two-digit 'HH' string.

    The result is rounded to the nearest hour: when the minutes exceed
    30, one hour is added, and 23:31+ wraps around to '00'.

    Parameters
    ----------
    value : str or None
        Time in 'HHMM' format.  None, the literal string 'nan', float
        NaN and malformed values are all tolerated.

    Returns
    -------
    str or float
        Hour string '00'..'23', or np.nan when the input is missing,
        illogical (hour >= 24, e.g. '4105') or unparseable.
    """
    try:
        if value in [None, 'nan']:
            return np.nan
        # Extract the hour part
        hour = int(value[:2])
        # Discard illogical hours (the source data contains values
        # such as '4105')
        if hour >= 24:
            return np.nan
        # Round up past the half-hour mark
        minute = int(value[2:])
        hour += 1 if minute > 30 else 0
        # 23:31+ rounds up to midnight
        hour = 0 if hour == 24 else hour
        return str(hour).zfill(2)
    except (TypeError, ValueError):
        # TypeError: non-sliceable input (e.g. float NaN);
        # ValueError: non-numeric digits or too-short strings.
        # (The original `except (TypeError, Exception)` was redundant
        # and swallowed every error, hiding genuine bugs.)
        return np.nan
# Apply the HHMM -> HH conversion to the whole column with a list
# comprehension (unparseable values come back as NaN)
hours = [get_hour(value) for value in df['c10']]
df['c10'] = hours
df.c10.unique()
array([nan, '00', '15', '14', '09', '13', '11', '20', '22', '18', '16',
'19', '21', '10', '08', '12', '17', '07', '01', '06', '23', '05',
'02', '04', '03'], dtype=object)
col = 'c20'
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f"VALUES\t{df[col].unique()}\n")
[+] c20 Latitude coordinates of the accident/incident VALUES [999. 42.16666667 51.33333333 ... 67.45 24.88333333 20.23333333]
# Dtype / non-null summary for the column
df[col].info()
<class 'pandas.core.series.Series'> RangeIndex: 214664 entries, 0 to 214663 Series name: c20 Non-Null Count Dtype -------------- ----- 214664 non-null float64 dtypes: float64(1) memory usage: 1.6 MB
La columna tiene el tipo de datos esperado, pero existe un valor ilógico introducido previamente, el 999, que se ha de convertir a NaN.
# 999 is a placeholder for an unknown latitude; map it to NaN.
# Assignment form instead of chained `inplace=True` on a column
# selection (deprecated; a no-op under pandas Copy-on-Write).
df[col] = df[col].replace(999, np.nan)
df[col].unique()
array([ nan, 42.16666667, 51.33333333, ..., 67.45 ,
24.88333333, 20.23333333])
col = 'c21'
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f"VALUES\t{df[col].unique()}\n")
[+] c21 Longitude coordinates of the accident/incident VALUES [ 999. -120.4 -37.5 ... -12.4 97.71666667 -140.23333333]
# Dtype / non-null summary for the column
df[col].info()
<class 'pandas.core.series.Series'> RangeIndex: 214664 entries, 0 to 214663 Series name: c21 Non-Null Count Dtype -------------- ----- 214664 non-null float64 dtypes: float64(1) memory usage: 1.6 MB
La columna tiene el tipo de datos esperado, pero existe un valor ilógico introducido previamente, el 999, que se ha de convertir a NaN.
# 999 is a placeholder for an unknown longitude; map it to NaN.
# Assignment form instead of chained `inplace=True` on a column
# selection (deprecated; a no-op under pandas Copy-on-Write).
df[col] = df[col].replace(999, np.nan)
df[col].unique()
array([ nan, -120.4 , -37.5 , ..., -12.4 ,
97.71666667, -140.23333333])
col = 'c31'
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f"VALUES\t{df[col].unique()}\n")
[+] c31 Airframe hours of the aircraft VALUES [nan '660.0' '28.0' ... '9887' '10009' '10905']
La columna para las horas de vuelo del avión debiera estar compuesta de números enteros. Por este motivo, se convierten todos los NaN a -1 (introduciendo así un valor ilógico) para poder convertir el tipo de la columna a entero, transformando en NaN los valores que produzcan error en la conversión.
# Temporarily replace NaN with the sentinel -1 (an illogical hours value)
# so the column survives the numeric coercion below.  Assignment form
# instead of chained `inplace=True` on a column selection (deprecated;
# a no-op under pandas Copy-on-Write).
df[col] = df[col].fillna(-1)
# Coerce the column to numeric; entries that fail to parse become NaN
df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')
df[col].info()
<class 'pandas.core.series.Series'> RangeIndex: 214664 entries, 0 to 214663 Series name: c31 Non-Null Count Dtype -------------- ----- 214640 non-null float64 dtypes: float64(1) memory usage: 1.6 MB
# Restore the -1 sentinel back to NaN.  Assignment form instead of
# chained `inplace=True` on a column selection (deprecated; a no-op
# under pandas Copy-on-Write).
df[col] = df[col].replace(-1, np.nan)
df[col].unique()
array([ nan, 660., 28., ..., 17919., 13163., 10745.])
col = 'c56'
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f"VALUES\t{df[col].unique()}\n")
[+] c56 Total number of hours the pilot has flown. VALUES [ 1187. 364. 205. ... 6537. 15179. 11291.]
# Dtype / non-null summary for the column
df[col].info()
<class 'pandas.core.series.Series'> RangeIndex: 214664 entries, 0 to 214663 Series name: c56 Non-Null Count Dtype -------------- ----- 177616 non-null float64 dtypes: float64(1) memory usage: 1.6 MB
La columna para las horas de vuelo del piloto, esta en formato float por lo que no se realiza ningún cambio de valores por estar en formato correcto.
col = 'c113'
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f"VALUES\t{df[col].unique()}\n")
[+] c113 Cloud ceiling VALUES [nan '600' '200' '400' 'CLER' '800' '7000' '4500' '3500' '1000' '2500' '500' '2000' '5000' '3000' 'GRIK' '2300' '1800' '6000' '9999' '2900' '3100' '900' '4000' '2200' '9000' '8000' '1721' '1200' '5500' '100' '700' '6400' '1500' '1900' '4900' '2400' '3400' '1300' '1700' '7500' '3200' '2100' '$)))' '3300' 'GRIL' '2700' 'GEIK' '4600' 'G10' '1600' '6500' 'SCAT' '2600' '1400' '300' '2800' '3800' '4400' '6' '9500' '2020' '1100' '4300' '3700' '8500' '7' '0700' 'BRKN' 'ZERO' '3600' '0' 'GR1K' '2' '1340' 'OCS' '850' '150' '0600' 'OVCT' '3900' '?' '.' '2280' '5600' '38' '10,0' '1' '4100' '5' '7200' '250' '8' '1032' '25' '4700' '75' '85' '50' '125' '4002' 'GRI' 'OCST' 'F' '40' '4200' '650' '0500' '10' '4800' '1350' 'GIK' 'GROL' 'GERI' '0400' '0100' '00' 'OBSC' '800L' '7800' '5005' '330G' '8400' '5002' 'GRKG' '0300' '0200' '9200' '0800' '900G' '8001' '200L' '1040' '0900' '9-99' '9997' '0000' '0850' '5800' '0004' '0740' '0350' '6600' '0003' '7003' '0010' '7600' '0050' '0660' '4003' '5200' '2003' '0450' '9007' '0006' '9003' '3001' '4001' '0150' '0060' '4566' '7002' '0250' '0045' '0009' '8200' '0005' '0020' '4350' '0075' '8003' '0750' '0007' '0001' '2504' '0804' '0999' '0040' '3380' '7400' '0002' '9025' '6100' '5400' '20' '350' '3404' '4509' '6002' '450' '3' '70' '5100' '0008' '0070' '0015' '5300' '0011' '0013' '0017' '0023' '0032' '0035' '0055' '0016' '0030' '0080' '0022' '0025' '0046' '0012' '0036' '0028' '0029' '0085' '0095' '0031' '0110' '0024' '0140' '0037' '0090' '0033' '0018' '0220' '0034' '0026' '0038' '0044' '5001' '0041' '0049' '0014' '0047' '0021' '0043' '0042' '2599' '0019' '0048' '0065' '0160' '0027' '3204' '0039' '0120' '5.0' '11.0' '15.0' '40.0' '10.0' '25.0' '50.0' '1.0' '30.0' '16.0' '6.0' '4.0' '9.0' '3.0' '41.0' '21.0' '47.0' '8.0' '20.0' '12.0' '60.0' '2.0' '35.0' '19.0' '7.0' '80.0' '18.0' '28.0' '27.0']
En la columna de la altura de las nubes se esperarían valores numéricos, por lo que se convierten todos los valores no numéricos a NaN, así como a formato numérico los valores que están en formato string.
# Temporarily replace NaN with the sentinel -1 (an illogical ceiling
# value) so the column survives the numeric coercion below.  Assignment
# form instead of chained `inplace=True` on a column selection
# (deprecated; a no-op under pandas Copy-on-Write).
df[col] = df[col].fillna(-1)
# Coerce the column to numeric; entries that fail to parse become NaN
df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')
df[col].info()
<class 'pandas.core.series.Series'> RangeIndex: 214664 entries, 0 to 214663 Series name: c113 Non-Null Count Dtype -------------- ----- 208070 non-null float64 dtypes: float64(1) memory usage: 1.6 MB
# Restore the -1 sentinel back to NaN.  Assignment form instead of
# chained `inplace=True` on a column selection (deprecated; a no-op
# under pandas Copy-on-Write).
df[col] = df[col].replace(-1, np.nan)
df[col].unique()
array([ nan, 6.000e+02, 2.000e+02, 4.000e+02, 8.000e+02, 7.000e+03,
4.500e+03, 3.500e+03, 1.000e+03, 2.500e+03, 5.000e+02, 2.000e+03,
5.000e+03, 3.000e+03, 2.300e+03, 1.800e+03, 6.000e+03, 9.999e+03,
2.900e+03, 3.100e+03, 9.000e+02, 4.000e+03, 2.200e+03, 9.000e+03,
8.000e+03, 1.721e+03, 1.200e+03, 5.500e+03, 1.000e+02, 7.000e+02,
6.400e+03, 1.500e+03, 1.900e+03, 4.900e+03, 2.400e+03, 3.400e+03,
1.300e+03, 1.700e+03, 7.500e+03, 3.200e+03, 2.100e+03, 3.300e+03,
2.700e+03, 4.600e+03, 1.600e+03, 6.500e+03, 2.600e+03, 1.400e+03,
3.000e+02, 2.800e+03, 3.800e+03, 4.400e+03, 6.000e+00, 9.500e+03,
2.020e+03, 1.100e+03, 4.300e+03, 3.700e+03, 8.500e+03, 7.000e+00,
3.600e+03, 0.000e+00, 2.000e+00, 1.340e+03, 8.500e+02, 1.500e+02,
3.900e+03, 2.280e+03, 5.600e+03, 3.800e+01, 1.000e+00, 4.100e+03,
5.000e+00, 7.200e+03, 2.500e+02, 8.000e+00, 1.032e+03, 2.500e+01,
4.700e+03, 7.500e+01, 8.500e+01, 5.000e+01, 1.250e+02, 4.002e+03,
4.000e+01, 4.200e+03, 6.500e+02, 1.000e+01, 4.800e+03, 1.350e+03,
7.800e+03, 5.005e+03, 8.400e+03, 5.002e+03, 9.200e+03, 8.001e+03,
1.040e+03, 9.997e+03, 5.800e+03, 4.000e+00, 7.400e+02, 3.500e+02,
6.600e+03, 3.000e+00, 7.003e+03, 7.600e+03, 6.600e+02, 4.003e+03,
5.200e+03, 2.003e+03, 4.500e+02, 9.007e+03, 9.003e+03, 3.001e+03,
4.001e+03, 6.000e+01, 4.566e+03, 7.002e+03, 4.500e+01, 9.000e+00,
8.200e+03, 2.000e+01, 4.350e+03, 8.003e+03, 7.500e+02, 2.504e+03,
8.040e+02, 9.990e+02, 3.380e+03, 7.400e+03, 9.025e+03, 6.100e+03,
5.400e+03, 3.404e+03, 4.509e+03, 6.002e+03, 7.000e+01, 5.100e+03,
1.500e+01, 5.300e+03, 1.100e+01, 1.300e+01, 1.700e+01, 2.300e+01,
3.200e+01, 3.500e+01, 5.500e+01, 1.600e+01, 3.000e+01, 8.000e+01,
2.200e+01, 4.600e+01, 1.200e+01, 3.600e+01, 2.800e+01, 2.900e+01,
9.500e+01, 3.100e+01, 1.100e+02, 2.400e+01, 1.400e+02, 3.700e+01,
9.000e+01, 3.300e+01, 1.800e+01, 2.200e+02, 3.400e+01, 2.600e+01,
4.400e+01, 5.001e+03, 4.100e+01, 4.900e+01, 1.400e+01, 4.700e+01,
2.100e+01, 4.300e+01, 4.200e+01, 2.599e+03, 1.900e+01, 4.800e+01,
6.500e+01, 1.600e+02, 2.700e+01, 3.204e+03, 3.900e+01, 1.200e+02])
col = 'c240'
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f"VALUES\t{df[col].unique()}\n")
[+] c240 Wind direction VALUES [nan '21' '05' '11' '0' '13' '30' '15' '09' '04' '36' '17' '26' '08' '31' '27' '33' '12' '23' '01' '32' '18' '25' '16' '07' '02' '35' '28' '34' '29' '24' '03' '14' '22' '06' '20' '10' '19' '00' '60' '39' '50' 'G3' 'G1' '55' '40' 'G4' 'C' '1' '3' '6' '5' '8' '47' '4' '45' '7' '38' '2' '90' '9' 'G2' 'G' '.9' 'OO' '37' 'N' '70' 'O' 'G6' '80' 'G5' '89' '66' '@@' 'G0' '75' '87' '24.0' '28.0' '5.0' '30.0' '33.0' '36.0' '11.0' '2.0' '22.0' '18.0' '20.0' '23.0' '15.0' '14.0' '29.0' '10.0' '35.0' '21.0' '13.0' '19.0' '12.0' '1.0' '27.0' '16.0' '9.0' '4.0' '34.0' '31.0' '7.0' '25.0' '17.0' '8.0' '32.0' '3.0' '6.0' '26.0' '0.0' '1B' '3N']
En la columna se esperarían valores numéricos, por lo que se convierten todos los valores no numéricos a NaN, así como a formato numérico los valores que están en formato string.
# Temporarily replace NaN with the sentinel -1 (an illogical wind
# direction) so the column survives the numeric coercion below.
# Assignment form instead of chained `inplace=True` on a column
# selection (deprecated; a no-op under pandas Copy-on-Write).
df[col] = df[col].fillna(-1)
# Coerce the column to numeric; entries that fail to parse become NaN
df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')
df[col].info()
<class 'pandas.core.series.Series'> RangeIndex: 214664 entries, 0 to 214663 Series name: c240 Non-Null Count Dtype -------------- ----- 214613 non-null float64 dtypes: float64(1) memory usage: 1.6 MB
# Restore the -1 sentinel back to NaN.  Assignment form instead of
# chained `inplace=True` on a column selection (deprecated; a no-op
# under pandas Copy-on-Write).
df[col] = df[col].replace(-1, np.nan)
df[col].unique()
array([ nan, 21. , 5. , 11. , 0. , 13. , 30. , 15. , 9. , 4. , 36. ,
17. , 26. , 8. , 31. , 27. , 33. , 12. , 23. , 1. , 32. , 18. ,
25. , 16. , 7. , 2. , 35. , 28. , 34. , 29. , 24. , 3. , 14. ,
22. , 6. , 20. , 10. , 19. , 60. , 39. , 50. , 55. , 40. , 47. ,
45. , 38. , 90. , 0.9, 37. , 70. , 80. , 89. , 66. , 75. , 87. ])
col = 'c241'
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f"VALUES\t{df[col].unique()}\n")
[+] c241 Wind speed in miles per hours VALUES [nan '15' '05' '07' '03' '10' '08' '06' '04' '09' '18' '12' '11' '24' '14' '22' '13' '25' '20' '19' '16' '17' '01' '02' '23' '8' '29' '26' '28' '44' '21' '52' '40' 'G1' 'G0' '2' 'G3' '90' 'G2' '30' '0' '00' 'G' '27' '34' '9' '31' 'S' 'B' '35' '4' '7' '5' '36' '6' '41' '55' '3' 'T' 'G4' 'G6' '60' 'X' '33' '50' '1' 'W' '43' '32' '80' '39' '99' '38' '51' '53' ')' '45' '0G' '46' '70' '56' '54' '58' 'G5' '49' '87' '66' '57' '37' '47' '48' '69' '9.0' '8.0' '5.0' '15.0' '6.0' '10.0' '0.0' '12.0' '7.0' '14.0' '3.0' '40.0' '2.0' '11.0' '25.0' '13.0' '16.0' '26.0' '21.0' '24.0' '4.0' '65.0' '23.0' '18.0' '20.0' '17.0' '35.0' '22.0' '28.0' '1.0' '30.0' '33.0' '19.0' '37.0' '32.0' '27.0' '57.0' '45.0' '31.0' '38.0' '87.0' '29.0' '84.0' '46.0' '58.0' '50.0' '36.0' '51.0' '75.0' '70.0' '92.0' '74.0' '80.0' '60.0' '34.0' '55.0' '42.0']
En la columna se esperarían valores numéricos, por lo que se convierten todos los valores no numéricos a NaN, así como a formato numérico los valores que están en formato string.
# Temporarily replace NaN with the sentinel -1 (an illogical wind speed)
# so the column survives the numeric coercion below.  Assignment form
# instead of chained `inplace=True` on a column selection (deprecated;
# a no-op under pandas Copy-on-Write).
df[col] = df[col].fillna(-1)
# Coerce the column to numeric; entries that fail to parse become NaN
df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')
df[col].info()
<class 'pandas.core.series.Series'> RangeIndex: 214664 entries, 0 to 214663 Series name: c241 Non-Null Count Dtype -------------- ----- 214538 non-null float64 dtypes: float64(1) memory usage: 1.6 MB
# Restore the -1 sentinel back to NaN.  Assignment form instead of
# chained `inplace=True` on a column selection (deprecated; a no-op
# under pandas Copy-on-Write).
df[col] = df[col].replace(-1, np.nan)
df[col].unique()
array([nan, 15., 5., 7., 3., 10., 8., 6., 4., 9., 18., 12., 11.,
24., 14., 22., 13., 25., 20., 19., 16., 17., 1., 2., 23., 29.,
26., 28., 44., 21., 52., 40., 90., 30., 0., 27., 34., 31., 35.,
36., 41., 55., 60., 33., 50., 43., 32., 80., 39., 99., 38., 51.,
53., 45., 46., 70., 56., 54., 58., 49., 87., 66., 57., 37., 47.,
48., 69., 65., 84., 75., 92., 74., 42.])
col = 'c243'
# Display the column's values
print(f"[+] {col}\t{col_name(col)}")
print(f"VALUES\t{df[col].unique()}\n")
[+] c243 Gust speed in miles per hour VALUES [nan '32' '16' '15' '12' '29' '40' '22' '27' '35' '18' '10' '30' '25' '31' '14' '20' '19' '28' '24' '17' '38' '36' '23' '26' '2' '34' '33' '52' '43' '21' '45' '53' '58' '05' '08' '06' '1' '0' '4' 'C' '42' '48' 'B' '37' '39' '7' '13' '60' '55' '07' '50' '44' 'G' 'S' 'L' '03' '02' '8' '04' '85' '09' '65' '80' '11' '5' '46' '70' '00' '90' '47' '75' '51' '41' '01' '77' '18.0' '25.0' '30.0' '15.0' '12.0' '20.0' '40.0' '22.0' '23.0' '35.0' '24.0' '10.0' '32.0' '28.0' '38.0' '19.0' '21.0' '41.0' '33.0' '27.0' '29.0' '55.0' '68.0' '45.0' '42.0' '36.0' '58.0' '37.0' '13.0' '7.0' '2.0' '17.0' '60.0' '8.0' '44.0' '39.0' '34.0' '16.0' '6.0' '26.0' '50.0' '11.0' '14.0' '46.0' '5.0' '9.0' '4.0' '31.0' '52.0' '48.0' '47.0' '70.0' '43.0' '3.0' '80.0' '54.0' '0.0' '65.0' '1.0' '62.0' '57.0' '49.0' '51.0' '56.0' '99.0']
En la columna se esperarían valores numéricos, por lo que se convierten todos los valores no numéricos a NaN, así como a formato numérico los valores que están en formato string.
# Temporarily replace NaN with the sentinel -1 (an illogical gust speed)
# so the column survives the numeric coercion below.  Assignment form
# instead of chained `inplace=True` on a column selection (deprecated;
# a no-op under pandas Copy-on-Write).
df[col] = df[col].fillna(-1)
# Coerce the column to numeric; entries that fail to parse become NaN
df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')
df[col].info()
<class 'pandas.core.series.Series'> RangeIndex: 214664 entries, 0 to 214663 Series name: c243 Non-Null Count Dtype -------------- ----- 214653 non-null float64 dtypes: float64(1) memory usage: 1.6 MB
# Restore the -1 sentinel back to NaN.  Assignment form instead of
# chained `inplace=True` on a column selection (deprecated; a no-op
# under pandas Copy-on-Write).
df[col] = df[col].replace(-1, np.nan)
df[col].unique()
array([nan, 32., 16., 15., 12., 29., 40., 22., 27., 35., 18., 10., 30.,
25., 31., 14., 20., 19., 28., 24., 17., 38., 36., 23., 26., 2.,
34., 33., 52., 43., 21., 45., 53., 58., 5., 8., 6., 1., 0.,
4., 42., 48., 37., 39., 7., 13., 60., 55., 50., 44., 3., 85.,
9., 65., 80., 11., 46., 70., 90., 47., 75., 51., 41., 77., 68.,
54., 62., 57., 49., 56., 99.])
Hasta ahora, se han marcado todos los valores perdidos indicándolos como NaN. Ahora, para poder construir el modelo de Machine Learning, se aplican una serie de estrategias para tratar estos valores, mediante la eliminación de instancias o la imputación de valores.
Nota
Las columnas c11, c12, c13 y c14 no son objeto de imputación ya que su uso es de apoyo a la imputación de otras columnas
# Visualize per-column missing-value counts (helper defined earlier
# in the notebook, not visible in this chunk)
plot_nan(df)
def categorical_histogram(col: str, df: pd.DataFrame = None, bins: int = 10):
    """Ordinally encode a categorical column, plot its box plot and histogram,
    and print the encoded categories and the skew.

    Args:
        col: Name of the categorical column to inspect.
        df: Dataframe holding the column. Defaults to the notebook-level
            ``df`` — resolved lazily at call time instead of being frozen
            into the signature at definition time.
        bins: Number of histogram bins.
    """
    if df is None:
        # Look up the notebook-level dataframe at call time
        df = globals()['df']
    # Encoder instance
    oe = OrdinalEncoder()
    # Fit and transform; columns mixing str and float NaN make OrdinalEncoder
    # raise, so retry with everything cast to str instead of a bare except.
    try:
        oe.fit(df[[col]])
        col_encoded = oe.transform(df[[col]])
    except (ValueError, TypeError):
        oe.fit(df[[col]].astype(str))
        col_encoded = oe.transform(df[[col]].astype(str))
    # Auxiliary dataframe with the encoded values
    names = oe.get_feature_names_out()
    col_df = pd.DataFrame(col_encoded, columns=names)
    # Box plot (distribution) next to the histogram
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
    col_df.plot(kind='box', ax=axes[0]).set_title('Distribución de los datos.')
    col_df.plot.hist(bins=bins, ax=axes[1])
    plt.show()
    print('[+] VALUES_ENCODED\t', oe.categories_[0])
    # .iloc[0] is explicit positional access; plain [0] on a Series indexed by
    # the column name is a deprecated label/position fallback in recent pandas.
    print('[+] SKEW\t', col_df.skew().iloc[0])
def numerical_histogram(col: str, df: pd.DataFrame = None, bins: int = 10):
    """Plot a numerical column's box plot and histogram and print its skew.

    Args:
        col: Name of the numerical column to inspect.
        df: Dataframe holding the column. Defaults to the notebook-level
            ``df`` — resolved lazily at call time instead of being frozen
            into the signature at definition time.
        bins: Number of histogram bins.
    """
    if df is None:
        # Look up the notebook-level dataframe at call time
        df = globals()['df']
    # Box plot (distribution) next to the histogram
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
    df[col].plot(kind='box', ax=axes[0]).set_title('Distribución de los datos.')
    df[col].plot.hist(bins=bins, ax=axes[1])
    plt.show()
    # .iloc[0] avoids the deprecated positional fallback of skew()[0]
    print('[+] SKEW\t', df[[col]].skew().iloc[0])
col = 'c7'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Month the accident/incident happened. [+] NAN_VALUES 32 0.01 %
| c7 | |
|---|---|
| count | 214632.000000 |
| mean | 6.523720 |
| std | 3.172425 |
| min | 1.000000 |
| 25% | 4.000000 |
| 50% | 7.000000 |
| 75% | 9.000000 |
| max | 12.000000 |
numerical_histogram(col, df, bins=12)
[+] SKEW -0.031124917205013317
Los datos se imputan teniendo en cuenta la mediana, que resulta ser 7, ya que se trata de meses del año. Otra opción, podría ser imputar la media redondeada para tener un numero entero.
# Impute missing months with the median (7); the rounded mean would be an
# alternative that also yields an integer month.
df[col].fillna(df[col].median(), inplace=True)
numerical_histogram(col, df, bins=12)
df[[col]].describe()
[+] SKEW -0.031193721710241634
| c7 | |
|---|---|
| count | 214664.000000 |
| mean | 6.523791 |
| std | 3.172194 |
| min | 1.000000 |
| 25% | 4.000000 |
| 50% | 7.000000 |
| 75% | 9.000000 |
| max | 12.000000 |
Tras la imputación, al tratarse de un número reducido de datos faltantes, apenas se modifica el sesgo.
col = 'c30'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Airworthiness class code of the aircraft [+] NAN_VALUES 174642 81.36 %
| c30 | |
|---|---|
| count | 40022 |
| unique | 6 |
| top | STRD |
| freq | 32323 |
Al tratarse de una columna categórica, se realiza una codificación de los datos para poder ver la distribución de estos y poder decidir una técnica de imputación.
categorical_histogram(col, df, bins=6)
df[[col]].describe()
[+] VALUES_ENCODED ['EXPT' 'LIMT' 'MULT' 'PROV' 'REST' 'STRD' nan] [+] SKEW -2.1014134506233697
| c30 | |
|---|---|
| count | 40022 |
| unique | 6 |
| top | STRD |
| freq | 32323 |
Con el gráfico del histograma se observa que la gran mayoría de los resultados son de la clase Standard "STRD". Pero, existe un gran numero de datos faltantes, por lo tanto, se decide limitar la imputación en el numero de NaN consecutivos que exista.
# Cap on how many NaNs get imputed, scaling the NaN count down by half of the
# missing-data percentage.
# NOTE(review): the original wording says "half of the percentage of missing
# values", but the formula is nan_values * (1 - (perc_nan / 2) / 100), which
# keeps most of the NaN count — confirm this is the intended limit.
limit = int(round(nan_values * (1 - (perc_nan / 2) / 100)))
print(f'[+] LIMIT\t{limit}')
[+] LIMIT 103598
# Imputación
df[col].fillna(df[col].mode()[0], inplace=True, limit=limit)
# Representación
categorical_histogram(col, df, bins=6)
df[[col]].describe()
[+] VALUES_ENCODED ['EXPT' 'LIMT' 'MULT' 'PROV' 'REST' 'STRD' nan] [+] SKEW -4.800071679091829
| c30 | |
|---|---|
| count | 143620 |
| unique | 6 |
| top | STRD |
| freq | 135921 |
col = 'c31'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Airframe hours of the aircraft [+] NAN_VALUES 94317 43.94 %
| c31 | |
|---|---|
| count | 120347.000000 |
| mean | 4946.355622 |
| std | 8731.752435 |
| min | 0.000000 |
| 25% | 1202.000000 |
| 50% | 2730.000000 |
| 75% | 4964.000000 |
| max | 99999.000000 |
numerical_histogram(col)
df[[col]].describe()
[+] SKEW 4.959558388999156
| c31 | |
|---|---|
| count | 120347.000000 |
| mean | 4946.355622 |
| std | 8731.752435 |
| min | 0.000000 |
| 25% | 1202.000000 |
| 50% | 2730.000000 |
| 75% | 4964.000000 |
| max | 99999.000000 |
Se observa que se trata de una caracteristica con un gran numero de datos átipicos pero con media dentro de la caja. Es por esto que se imputa los datos faltantes con la media de los atributos.
df[col].fillna(df[col].mean(), inplace=True)
df[[col]].describe()
| c31 | |
|---|---|
| count | 214664.000000 |
| mean | 4946.355622 |
| std | 6537.905618 |
| min | 0.000000 |
| 25% | 2393.000000 |
| 50% | 4946.355622 |
| 75% | 4946.355622 |
| max | 99999.000000 |
Se observa con la imputación de los valores que la desviación estandar se reduce así como la mediana y la moda coinciden. Más adelante se seguirá trabajando este atributo al tener un fuerte sesgo hacia la derecha.
col = 'c41'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Certificate type code of the pilot in command [+] NAN_VALUES 36718 17.1 %
| c41 | |
|---|---|
| count | 177946 |
| unique | 10 |
| top | 01 |
| freq | 66670 |
Al tratarse de una columna categórica, se realiza una codificación de los datos para poder ver la distribución de estos y poder decidir una técnica de imputación.
categorical_histogram(col, df, bins=10)
df[[col]].describe()
[+] VALUES_ENCODED ['00' '01' '02' '03' '09' 'F1' 'F3' 'F9' 'TT' 'XX' nan] [+] SKEW 0.8023001384208701
| c41 | |
|---|---|
| count | 177946 |
| unique | 10 |
| top | 01 |
| freq | 66670 |
# Imputación
df[col].fillna(df[col].mode()[0], inplace=True)
# Representación
categorical_histogram(col, df, bins=10)
df[[col]].describe()
[+] VALUES_ENCODED ['00' '01' '02' '03' '09' 'F1' 'F3' 'F9' 'TT' 'XX'] [+] SKEW 1.0494540577472309
| c41 | |
|---|---|
| count | 214664 |
| unique | 10 |
| top | 01 |
| freq | 103388 |
col = 'c49'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Qualification code of the pilot in command . [+] NAN_VALUES 58523 27.26 %
| c49 | |
|---|---|
| count | 156141 |
| unique | 14 |
| top | S |
| freq | 122943 |
categorical_histogram(col, df, bins=14)
df[[col]].describe()
[+] VALUES_ENCODED ['E' 'F' 'G' 'H' 'M' 'N' 'P' 'R' 'S' 'T' 'U' 'V' 'W' 'X' nan] [+] SKEW -1.8104273962780242
| c49 | |
|---|---|
| count | 156141 |
| unique | 14 |
| top | S |
| freq | 122943 |
# Imputación
df[col].fillna(df[col].mode()[0], inplace=True)
# Representación
categorical_histogram(col, df, bins=14)
df[[col]].describe()
[+] VALUES_ENCODED ['E' 'F' 'G' 'H' 'M' 'N' 'P' 'R' 'S' 'T' 'U' 'V' 'W' 'X'] [+] SKEW -2.34426759159736
| c49 | |
|---|---|
| count | 214664 |
| unique | 14 |
| top | S |
| freq | 181466 |
col = 'c56'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Total number of hours the pilot has flown. [+] NAN_VALUES 37048 17.26 %
| c56 | |
|---|---|
| count | 177616.000000 |
| mean | 3431.971410 |
| std | 5335.006461 |
| min | 0.000000 |
| 25% | 300.000000 |
| 50% | 1200.000000 |
| 75% | 4100.000000 |
| max | 92591.000000 |
numerical_histogram(col, df, bins=10)
[+] SKEW 2.8982753964902717
Existe un sesgo a la derecha cercano a 3, por lo que se decide realizar la imputación teniendo en cuenta la mediana.
# Imputación de datos
df[col].fillna(df[col].median(), inplace=True)
# Representación de datos
numerical_histogram(col, df, bins=10)
print(f'[+] IMPUTED\t{nan_values - df[[col]].isna().sum()[0]}')
df[[col]].describe()
[+] SKEW 3.2661643138319256 [+] IMPUTED 37048
| c56 | |
|---|---|
| count | 214664.000000 |
| mean | 3046.764404 |
| std | 4925.593126 |
| min | 0.000000 |
| 25% | 406.000000 |
| 50% | 1200.000000 |
| 75% | 3195.000000 |
| max | 92591.000000 |
col = 'c96'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Phase of flight code [+] NAN_VALUES 44967 20.95 %
| c96 | |
|---|---|
| count | 169697 |
| unique | 58 |
| top | HB |
| freq | 28203 |
categorical_histogram(col, df, bins=58)
df[[col]].describe()
[+] VALUES_ENCODED ['AA' 'AB' 'AC' 'AD' 'BA' 'BB' 'BC' 'BD' 'BE' 'BF' 'BG' 'BH' 'BI' 'DA' 'DB' 'DD' 'DE' 'DF' 'DG' 'DH' 'EX' 'FA' 'FB' 'FC' 'FD' 'FE' 'FF' 'FG' 'GX' 'HA' 'HB' 'HC' 'HD' 'HE' 'HF' 'HG' 'HH' 'HK' 'HM' 'IA' 'IB' 'IC' 'ID' 'IE' 'IF' 'IG' 'IH' 'JX' 'KA' 'KB' 'KC' 'KD' 'KE' 'KF' 'KG' 'PJ' 'PL' 'SR' nan] [+] SKEW 0.15724834154525713
| c96 | |
|---|---|
| count | 169697 |
| unique | 58 |
| top | HB |
| freq | 28203 |
# Imputación
df[col].fillna(df[col].mode()[0], inplace=True)
# Representación
categorical_histogram(col, df, bins=58)
df[[col]].describe()
[+] VALUES_ENCODED ['AA' 'AB' 'AC' 'AD' 'BA' 'BB' 'BC' 'BD' 'BE' 'BF' 'BG' 'BH' 'BI' 'DA' 'DB' 'DD' 'DE' 'DF' 'DG' 'DH' 'EX' 'FA' 'FB' 'FC' 'FD' 'FE' 'FF' 'FG' 'GX' 'HA' 'HB' 'HC' 'HD' 'HE' 'HF' 'HG' 'HH' 'HK' 'HM' 'IA' 'IB' 'IC' 'ID' 'IE' 'IF' 'IG' 'IH' 'JX' 'KA' 'KB' 'KC' 'KD' 'KE' 'KF' 'KG' 'PJ' 'PL' 'SR'] [+] SKEW -0.09581884131592443
| c96 | |
|---|---|
| count | 214664 |
| unique | 58 |
| top | HB |
| freq | 73170 |
col = 'c106'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Primary flying condition code [+] NAN_VALUES 42724 19.9 %
| c106 | |
|---|---|
| count | 171940 |
| unique | 3 |
| top | B |
| freq | 130636 |
categorical_histogram(col, df, bins=18)
[+] VALUES_ENCODED ['A' 'B' 'C' nan] [+] SKEW 0.2932921334971132
Al observar la distribución de los datos, claramente se observa que la característica B es la que más se extiende por el atributo, además, dado el numero de datos faltantes y los existentes para esta característica, se considera que imputar por el más frecuente se trata de una buena opción.
df[col].fillna(df[col].mode()[0], inplace=True)
df[[col]].isna().sum()
c106 0 dtype: int64
categorical_histogram(col, df, bins=18)
[+] VALUES_ENCODED ['A' 'B' 'C'] [+] SKEW 0.46765310508606434
Finalmente, tras la imputación, existe un ligero aumento de un sesgo a la derecha, de 0,2 puntos, que se considera no excesivo.
col = 'c108'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Secondary flying condition code [+] NAN_VALUES 42969 20.02 %
| c108 | |
|---|---|
| count | 171695 |
| unique | 23 |
| top | S |
| freq | 138179 |
categorical_histogram(col, df, bins=23)
[+] VALUES_ENCODED ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' nan] [+] SKEW -2.0978404805175357
Al observar la distribución de los datos, claramente se observa que la categoría S es la que más se extiende por el atributo; además, dado el número de datos faltantes y los existentes para esta categoría, se considera que imputar por el valor más frecuente es una buena opción.
df[col].fillna(df[col].mode()[0], inplace=True)
df[[col]].isna().sum()
c108 0 dtype: int64
categorical_histogram(col, df, bins=23)
[+] VALUES_ENCODED ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'] [+] SKEW -2.4835055731890923
Finalmente, tras la imputación, existe un ligero aumento de un sesgo a la derecha, de 0,2 puntos, que se considera no excesivo.
col = 'c240'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Wind direction [+] NAN_VALUES 141071 65.72 %
| c240 | |
|---|---|
| count | 73593.000000 |
| mean | 18.625384 |
| std | 10.855023 |
| min | 0.000000 |
| 25% | 10.000000 |
| 50% | 19.000000 |
| 75% | 27.000000 |
| max | 90.000000 |
numerical_histogram(col, df, bins=10)
[+] SKEW -0.1379627129535676
Teniendo en cuenta el alto numero de NaN de la columna, se realiza una limitación de la imputación. Se limitará la imputación a los valores NaN seguidos superior a la mitad de dichos datos faltantes.
Dado que el sesgo que existe es cercano a 0, se realiza una imputación teniendo en cuenta la media.
# Calculo del limite: Mitad del porcentaje de valores faltantes
limit = int(round(nan_values * (1 - (perc_nan / 2) / 100)))
print(f'[+] LIMIT\t{limit}')
[+] LIMIT 94715
# Imputación de datos
df[col].fillna(df[col].mean(), inplace=True, limit=limit)
# Representación de datos
numerical_histogram(col, df, bins=10)
print(f'[+] IMPUTED\t{nan_values - df[[col]].isna().sum()[0]}')
df[[col]].describe()
[+] SKEW -0.20863677953637016 [+] IMPUTED 94715
| c240 | |
|---|---|
| count | 168308.000000 |
| mean | 18.625384 |
| std | 7.177859 |
| min | 0.000000 |
| 25% | 18.625384 |
| 50% | 18.625384 |
| 75% | 18.625384 |
| max | 90.000000 |
col = 'c241'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Wind speed in miles per hours [+] NAN_VALUES 142012 66.16 %
| c241 | |
|---|---|
| count | 72652.000000 |
| mean | 8.814348 |
| std | 5.961852 |
| min | 0.000000 |
| 25% | 5.000000 |
| 50% | 8.000000 |
| 75% | 11.000000 |
| max | 99.000000 |
numerical_histogram(col, df, bins=10)
[+] SKEW 2.1958949925009055
Teniendo en cuenta que el 66% de los datos de la columna son NaN, se realiza una limitación de la imputación. Se limitará la imputación a los valores NaN seguidos superior a la mitad de dichos datos faltantes.
Teniendo en cuenta el sesgo existente, se imputan los valores teniendo en cuenta la mediana.
# Calculo del limite: Mitad del porcentaje de valores faltantes
limit = int(round(nan_values * (1 - (perc_nan / 2) / 100)))
print(f'[+] LIMIT\t{limit}')
[+] LIMIT 95034
# Imputación de datos
df[col].fillna(df[col].median(), inplace=True, limit=limit)
# Representación de datos
numerical_histogram(col, df, bins=10)
print(f'[+] IMPUTED\t{nan_values - df[[col]].isna().sum()[0]}')
df[[col]].describe()
[+] SKEW 3.6314096651190324 [+] IMPUTED 95034
| c241 | |
|---|---|
| count | 167686.000000 |
| mean | 8.352826 |
| std | 3.944927 |
| min | 0.000000 |
| 25% | 8.000000 |
| 50% | 8.000000 |
| 75% | 8.000000 |
| max | 99.000000 |
col = 'c243'
print('[+] COL\t', col_name(col), '\n')
nan_values = df[[col]].isna().sum()[0]
perc_nan = round((nan_values/df.shape[0]) * 100, 2)
print(f'[+] NAN_VALUES\t{nan_values}\t{perc_nan} %')
df[[col]].describe()
[+] COL Gust speed in miles per hour [+] NAN_VALUES 204143 95.1 %
| c243 | |
|---|---|
| count | 10521.000000 |
| mean | 22.036689 |
| std | 9.139965 |
| min | 0.000000 |
| 25% | 15.000000 |
| 50% | 20.000000 |
| 75% | 26.000000 |
| max | 99.000000 |
numerical_histogram(col, df, bins=10)
[+] SKEW 1.128873924453862
Teniendo en cuenta el alto numero de NaN de la columna, se realiza una limitación de la imputación. Se limitará la imputación a los valores NaN seguidos superior a la mitad de dichos datos faltantes.
Dado que el sesgo que existe es pequeño, se realiza una imputación teniendo en cuenta la media.
# Calculo del limite: Mitad del porcentaje de valores faltantes
limit = int(round(nan_values * (1 - (perc_nan / 2) / 100)))
print(f'[+] LIMIT\t{limit}')
[+] LIMIT 107073
# Imputación de datos
df[col].fillna(df[col].mean(), inplace=True, limit=limit)
# Representación de datos
numerical_histogram(col, df, bins=10)
print(f'[+] IMPUTED\t{nan_values - df[[col]].isna().sum()[0]}')
df[[col]].describe()
[+] SKEW 3.773576229693608 [+] IMPUTED 107073
| c243 | |
|---|---|
| count | 117594.000000 |
| mean | 22.036689 |
| std | 2.733768 |
| min | 0.000000 |
| 25% | 22.036689 |
| 50% | 22.036689 |
| 75% | 22.036689 |
| max | 99.000000 |
Dada la relación de c110 y c10, y encontrandose en diferentes grupos, se realiza la comprobacion de sus NaN para ver cual se debiera imputar primero.
# Muestra de valores NaN para cada columna
related_cols = ['c110', 'c10']
for col in related_cols:
print(f'[+] NAN_VALUES\t{col}\t{col_name(col):<35}\t{df[[col]].isna().sum()[0]}')
[+] NAN_VALUES c110 Light condition code 46195 [+] NAN_VALUES c10 Local time of the accident/incident. 25668
Ya que c110 tiene un número mayor de valores faltantes, se imputa primero el grupo al que este pertenece, Estado del cielo, que Hora del incidente.
col = ['c20', 'c21']
print('[+] COL\t', col_name(col[0]), '&', col_name(col[1]), '\n')
print('[+] NAN_VALUES\t', df[[col[0]]].isna().sum()[0])
print('[+] NAN_VALUES\t', df[[col[1]]].isna().sum()[0])
df[col].describe()
[+] COL Latitude coordinates of the accident/incident & Longitude coordinates of the accident/incident [+] NAN_VALUES 156390 [+] NAN_VALUES 156415
| c20 | c21 | |
|---|---|---|
| count | 58274.000000 | 58249.000000 |
| mean | 39.303099 | -97.838415 |
| std | 9.263097 | 26.061591 |
| min | -163.866667 | -180.750000 |
| 25% | 33.866667 | -113.716667 |
| 50% | 38.966667 | -93.550000 |
| 75% | 42.616667 | -81.683333 |
| max | 90.916667 | 180.000000 |
print('[+] Latitude')
numerical_histogram(col[0], df, bins=5)
[+] Latitude
[+] SKEW -0.22389156785945905
print('[+] Longitude')
numerical_histogram(col[1], df, bins=5)
[+] Longitude
[+] SKEW 1.2204295481897933
Según los sesgos obtenidos, se realiza una imputación con la media para los valores de Latitud y de mediana para la Longitud ya que representa un sesgo mayor.
Para realizar la imputación de los valores perdidos de latitud y longitud, primero se agrupa los datos según las columnas para estado, ciudad, region y distrito. Pero para saber en que orden se ha de realizar la agrupación, se extrae el total de valores unicos de cada columna, de modo que podamos agrupar de más grande a más pequeño.
df[['c11', 'c12', 'c13', 'c14']].nunique()
c11 13 c12 270 c13 170 c14 19418 dtype: int64
Teniendo en cuenta los resultados, se realiza el agrupamiento por orden de Region(c11)/Estado(c13)/Ciudad(c14). Tras analizar los datos que se tiene, se decide eliminar de la agrupación la columna c12, relativa a la region, porque no aporta una información tan clara de su significado para la agrupación como si las otras columnas.
# Crear los grupos en orden segun: Region - Estado - Ciudad
groups = df.groupby(['c11', 'c13', 'c14'], group_keys=False, dropna=True)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.mean(numeric_only=True)[col]
[+] TOTAL_GROUPS 28249
| c20 | c21 | |||
|---|---|---|---|---|
| c11 | c13 | c14 | ||
| AL | AB | BONNYVILLE | 53.916667 | NaN |
| AK | ACKERMAN LAKE | NaN | NaN | |
| ADAK ISLAND | NaN | NaN | ||
| ADMIRALTY ISLAN | NaN | NaN | ||
| AFOGNAK ISL | NaN | NaN | ||
| ... | ... | ... | ... | ... |
| WP | ZZ | MAJURO MARSHALL | -0.016667 | 164.716667 |
| OSAKA JAPAN | 34.433333 | 135.233333 | ||
| PARDUBICE | 50.000000 | 15.733333 | ||
| SALTILLO | NaN | NaN | ||
| TOKYO | 36.033333 | 14.166667 |
28249 rows × 2 columns
Una vez se tiene el orden de los grupos, se obtiene un total de 28000 grupos; sobre estos grupos, a continuación, se realizará una imputación por grupo utilizando la media (latitud) y la mediana (longitud).
# Group-wise imputation: fill c20 (latitude) with each group's mean and
# c21 (longitude) with each group's median, per the skew analysis above
df[[col[0]]] = groups[[col[0]]].transform(lambda group: group.fillna(group.mean()))
df[[col[1]]] = groups[[col[1]]].transform(lambda group: group.fillna(group.median()))
# Representar datos
print('[+] Latitude')
numerical_histogram(col[0], df, bins=5)
print('[+] Longitude')
numerical_histogram(col[1], df, bins=5)
df[col].describe()
[+] Latitude
[+] SKEW 0.31415690404159635 [+] Longitude
[+] SKEW 0.41264274114246563
| c20 | c21 | |
|---|---|---|
| count | 184376.000000 | 184365.000000 |
| mean | 38.606490 | -97.558997 |
| std | 8.025867 | 22.163640 |
| min | -163.866667 | -180.750000 |
| 25% | 33.735714 | -112.066667 |
| 50% | 38.682203 | -93.300000 |
| 75% | 42.016667 | -82.050000 |
| max | 90.916667 | 180.000000 |
Tras la imputación, se observa que el sesgo se ha conseguido reducir hasta 1 punto en el caso de la Longitud, mientras que la Latitud se ha desplazado hacia la derecha.
Para estas relaciones, primero se evaluará cuál de las columnas relacionadas contiene un mayor número de datos NaN para imputar sus datos en relación a las otras.
# Muestra de valores NaN para cada columna
related_cols = ['c110', 'c112', 'c113', 'c114', 'c115']
for col in related_cols:
print(f'[+] NAN_VALUES\t{col}\t{col_name(col):<35}\t{df[[col]].isna().sum()[0]}')
[+] NAN_VALUES c110 Light condition code 46195 [+] NAN_VALUES c112 Sky condition code 124369 [+] NAN_VALUES c113 Cloud ceiling 194862 [+] NAN_VALUES c114 Visibility code 126446 [+] NAN_VALUES c115 Visibility restriction code. 196487
A continuación se imputará los valores a las columnas en función de los resultados obtenidos.
col = 'c115'
print('[+] COL\t', col_name(col), '\n')
print('[+] NAN_VALUES\t', df[[col]].isna().sum()[0])
df[[col]].describe()
[+] COL Visibility restriction code. [+] NAN_VALUES 196487
| c115 | |
|---|---|
| count | 18177 |
| unique | 8 |
| top | HAZ |
| freq | 6992 |
categorical_histogram(col, df, bins=8)
[+] VALUES_ENCODED ['BLS' 'DRZ' 'DST' 'FOG' 'HAZ' 'RAI' 'SMK' 'SNO' nan] [+] SKEW 0.003743929340825541
Para la codificación de estos datos, del mismo modo que con las columnas anteriores, existe una relación con una serie de columnas, en concreto con:
df[['c112', 'c110', 'c114']].nunique()
c112 5 c110 5 c114 12 dtype: int64
Teniendo en cuenta los resultados, se realiza el agrupamiento por orden de c110/c112/c114
# Convert the column to numeric codes so group-wise imputation can operate
# Encoder instance
oe = OrdinalEncoder()
oe.fit(df[[col]])
col_encoded = oe.transform(df[[col]])
# Overwrite the column with its encoded (float) values
df[col] = col_encoded
# Build the groups: light condition (c110) - sky condition (c112) - visibility code (c114)
groups = df.groupby(['c110', 'c112', 'c114'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.mean(numeric_only=True)[[col]]
[+] TOTAL_GROUPS 372
| c115 | |||
|---|---|---|---|
| c110 | c112 | c114 | |
| 1 | BRKN | 1 | 4.020202 |
| 10 | 4.648148 | ||
| 2 | 4.008511 | ||
| 3 | 4.027668 | ||
| 4 | 4.067729 | ||
| ... | ... | ... | ... |
| NaN | NaN | 7 | NaN |
| 9 | NaN | ||
| G10 | 7.000000 | ||
| LS1 | 1.500000 | ||
| NaN | 3.606299 |
372 rows × 1 columns
Para la imputación de valores, se observa que los datos de la columna describen situaciones de baja visibilidad. Se entiende que muchos de estos datos NaN pueden ser fruto de un error de tipado o de no haberse recogido, pero también es posible que parte de ellos se deba a situaciones de buena visibilidad.
Dados los agrupaminetos, se considera que aquellas columnas cuya condicion de cielo (c112) sea CLER, BRKN o SCAT y la visibilidad (c114) mayor o igual a 6, el código de restricción sera "NONE".
El resto de los valores, por el sesgo que existe, se imputará la media.
# Get the encoded categories and reserve one extra index for 'NONE'
categories = list(oe.categories_[0])
# Map each ordinal code back to its category label
dic_categories = {idx: cat for idx, cat in enumerate(categories)}
dic_categories.update({len(categories): 'NONE'})
dic_categories
{0: 'BLS',
1: 'DRZ',
2: 'DST',
3: 'FOG',
4: 'HAZ',
5: 'RAI',
6: 'SMK',
7: 'SNO',
8: nan,
9: 'NONE'}
def impute_restriction(value):
    """
    Fill missing visibility-restriction codes with the 'NONE' code (9) for
    groups whose sky condition is clear/broken/scattered and whose visibility
    code is 6 or better ('6'-'9' or 'G10'); leave every other group untouched.
    """
    NONE_CODE = 9
    good_sky = ('CLER', 'BRKN', 'SCAT')
    good_visibility = {str(d) for d in range(6, 10)} | {'G10'}
    # The group key tuple (c110, c112, c114) is exposed through .name
    keys = [str(part) for part in value.name]
    if keys[1] in good_sky and keys[2] in good_visibility:
        return value.fillna(NONE_CODE)
    return value
# Apply the visibility-restriction imputation within each group
df[[col]] = groups[[col]].apply(impute_restriction)
# Map numeric codes back to their category labels; unmapped values become NaN.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 (use .map).
df[[col]] = df[[col]].applymap(lambda x: dic_categories.get(x, np.nan))
# Presentar datos
categorical_histogram(col, df, bins=8)
df[[col]].describe()
[+] VALUES_ENCODED ['BLS' 'DRZ' 'DST' 'FOG' 'HAZ' 'NONE' 'RAI' 'SMK' 'SNO' nan] [+] SKEW -0.8350355000749172
| c115 | |
|---|---|
| count | 68213 |
| unique | 9 |
| top | NONE |
| freq | 50036 |
col = 'c113'
print('[+] COL\t', col_name(col), '\n')
print('[+] NAN_VALUES\t', df[[col]].isna().sum()[0])
df[[col]].describe()
[+] COL Cloud ceiling [+] NAN_VALUES 194862
| c113 | |
|---|---|
| count | 19802.000000 |
| mean | 3043.249571 |
| std | 2674.987580 |
| min | 0.000000 |
| 25% | 800.000000 |
| 50% | 2500.000000 |
| 75% | 4500.000000 |
| max | 9999.000000 |
numerical_histogram(col, df, bins=5)
[+] SKEW 0.9894317762650485
Teniendo en cuenta el significado de la columna, esta tiene una relacion con la columna c112, condiciones del cielo, pues sus datos representan el estado de las nubes en el momento del incidente. Además, también se tiene en cuenta las columnas de visibilidad, la c114 para la distancia de visión y c115 para el tipo de restricción. Dada esta relación, se agrupa el dataframe y se imputa los valores.
df[['c112', 'c114', 'c115']].nunique()
c112 5 c114 12 c115 9 dtype: int64
Teniendo en cuenta los resultados, se realiza el agrupamiento por orden de Condición del cielo (c112)/Restricción(c115)/Distancia de visión(c114).
# Crear los grupos en orden segun: Condición del cielo - Restriccion de vision - Distancia de visión
groups = df.groupby(['c112', 'c115', 'c114'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.median(numeric_only=True)[[col]]
[+] TOTAL_GROUPS 533
| c113 | |||
|---|---|---|---|
| c112 | c115 | c114 | |
| BRKN | BLS | 1 | 1300.0 |
| 10 | 758.0 | ||
| 2 | 1200.0 | ||
| 3 | 2500.0 | ||
| 4 | 8000.0 | ||
| ... | ... | ... | ... |
| NaN | NaN | 8 | 4250.0 |
| 9 | 4750.0 | ||
| G10 | 3500.0 | ||
| LS1 | NaN | ||
| NaN | 38.0 |
533 rows × 1 columns
Para la imputación, se realiza teniendo en cuenta la mediana del atributo para cada uno de los códigos de la distancia de visión.
Dado el sesgo que existe, se opta por realizar una imputación teniendo en cuenta la mediana.
def impute_ceiling(value):
    """
    Fill missing cloud-ceiling values with the median of the group.

    Receives one pandas group of c113 values and returns it with every NaN
    replaced by that group's median.
    """
    return value.fillna(value.median())
# Aplicar la imputación
df[[col]] = groups[[col]].apply(impute_ceiling)
# Representar datos
numerical_histogram(col, df, bins=5)
df[[col]].describe()
[+] SKEW 1.5290207984601834
| c113 | |
|---|---|
| count | 211245.000000 |
| mean | 1246.399465 |
| std | 1750.345638 |
| min | 0.000000 |
| 25% | 38.000000 |
| 50% | 38.000000 |
| 75% | 2500.000000 |
| max | 9999.000000 |
Tras aplicar la imputación, se ha conseguido imputar un gran numero de datos faltantes manteniendose además la distribución de un modo similar, aunque generando un sesgo hacia la derecha.
col = 'c114'
print('[+] COL\t', col_name(col), '\n')
print('[+] NAN_VALUES\t', df[[col]].isna().sum()[0])
df[[col]].describe()
[+] COL Visibility code [+] NAN_VALUES 126446
| c114 | |
|---|---|
| count | 88218 |
| unique | 12 |
| top | G10 |
| freq | 48211 |
categorical_histogram(col, df, bins=12)
[+] VALUES_ENCODED ['1' '10' '2' '3' '4' '5' '6' '7' '8' '9' 'G10' 'LS1' nan] [+] SKEW -0.8669808518337043
Para la codificación de estos datos, del mismo modo que con las columnas anteriores, existe una relación con una serie de columnas, en concreto con el estado del cielo (c112), el código de la luz (c110) y la restriccion de la visión (c115).
df[['c112', 'c110', 'c115', 'c10']].nunique()
c112 5 c110 5 c115 9 c10 24 dtype: int64
Teniendo en cuenta los resultados, se realiza el agrupamiento por orden de c110/c112/c115
# Convertir la columna a numérica para realizar la imputación
# Instancia del codificador
oe = OrdinalEncoder()
oe.fit(df[[col]])
col_encoded = oe.transform(df[[col]])
# Crear un dataframe auxiliar con los atributos y valores transformados
df[col] = col_encoded
# Crear los grupos en orden segun: Codigo de luz - Estado del cielo - Restriccion de visión
groups = df.groupby(['c110', 'c112', 'c115', 'c10'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.mean(numeric_only=True)[[col]]
[+] TOTAL_GROUPS 2275
| c114 | ||||
|---|---|---|---|---|
| c110 | c112 | c115 | c10 | |
| 1 | BRKN | BLS | 08 | 2.000000 |
| 09 | 0.500000 | |||
| 10 | 4.333333 | |||
| 11 | 5.500000 | |||
| 13 | 4.000000 | |||
| ... | ... | ... | ... | ... |
| NaN | NaN | NaN | 20 | 5.250000 |
| 21 | 1.000000 | |||
| 22 | NaN | |||
| 23 | NaN | |||
| NaN | 1.000000 |
2275 rows × 1 columns
Por el sesgo cercano a la unidad, se decide realizar una imputación teniendo en cuenta la mediana.
# Aplicar la imputación
df[[col]] = groups[[col]].transform(lambda group: group.fillna(group.median()))
# df[[col]] = groups[[col]].transform(lambda group: group.fillna(round(group.mean(), 0)))
# Devolver los valores a su categoria
df[[col]] = oe.inverse_transform(df[[col]])
# Presentar datos
categorical_histogram(col, df, bins=12)
df[[col]].describe()
[+] VALUES_ENCODED ['1' '10' '2' '3' '4' '5' '6' '7' '8' '9' 'G10' 'LS1' nan] [+] SKEW -0.9852153692695964
| c114 | |
|---|---|
| count | 189679 |
| unique | 12 |
| top | G10 |
| freq | 101138 |
Tras la imputación, se mantiene el sesgo hacia la izquierda.
col = 'c112'
print('[+] COL\t', col_name(col), '\n')
print('[+] NAN_VALUES\t', df[[col]].isna().sum()[0])
df[[col]].describe()
[+] COL Sky condition code [+] NAN_VALUES 124369
| c112 | |
|---|---|
| count | 90295 |
| unique | 5 |
| top | CLER |
| freq | 40716 |
categorical_histogram(col, df, bins=5)
[+] VALUES_ENCODED ['BRKN' 'CLER' 'OBSC' 'OCST' 'SCAT' nan] [+] SKEW 0.4390215717801383
Teniendo en cuenta el significado de la columna, esta tiene una relación con las columnas de visibilidad: la c114 para la distancia de visión y la c115 para el tipo de restricción. Dada esta relación, se agrupa el dataframe y se imputan los valores.
df[['c114', 'c115']].nunique()
c114 12 c115 9 dtype: int64
Teniendo en cuenta los resultados, se realiza el agrupamiento por orden de Restricción(c115)/Distancia visión(c114).
# Convertir la columna a numérica para realizar la imputación
# Instancia del codificador
oe = OrdinalEncoder()
oe.fit(df[[col]])
col_encoded = oe.transform(df[[col]])
# Crear un dataframe auxiliar con los atributos y valores transformados
df[col] = col_encoded
# Crear los grupos en orden segun: Restriccion de vision - Distancia de visión
groups = df.groupby(['c115', 'c114'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.median(numeric_only=True)[[col]]
[+] TOTAL_GROUPS 119
| c112 | ||
|---|---|---|
| c115 | c114 | |
| BLS | 1 | 2.0 |
| 10 | 3.0 | |
| 2 | 3.0 | |
| 3 | 3.0 | |
| 4 | 2.0 | |
| ... | ... | ... |
| NaN | 8 | 3.0 |
| 9 | 3.0 | |
| G10 | 3.0 | |
| LS1 | 3.0 | |
| NaN | 1.0 |
119 rows × 1 columns
# Aplicar la imputación
df[[col]] = groups[[col]].transform(lambda group: group.fillna(group.median()))
# Devolver los valores a su categoria
df[[col]] = oe.inverse_transform(df[[col]])
categorical_histogram(col, df, bins=5)
df[[col]].describe()
[+] VALUES_ENCODED ['BRKN' 'CLER' 'OBSC' 'OCST' 'SCAT' nan] [+] SKEW -0.1135850270264526
| c112 | |
|---|---|
| count | 214653 |
| unique | 5 |
| top | OCST |
| freq | 94337 |
Tras la imputación se observa que el sesgo que existía, se ha reducido en 0.2 puntos.
# Inspect the light-condition column (c110) before imputation
col = 'c110'
print('[+] COL\t', col_name(col), '\n')
print('[+] NAN_VALUES\t', df[[col]].isna().sum()[0])
df[[col]].describe()
[+] COL Light condition code [+] NAN_VALUES 46195
| c110 | |
|---|---|
| count | 168469 |
| unique | 5 |
| top | 1 |
| freq | 140457 |
categorical_histogram(col, df, bins=5)
[+] VALUES_ENCODED ['1' '2' '3' '4' '5' nan] [+] SKEW 3.2817021176777317
Para la imputación, se agrupa el dataframe por las horas del accidente; se tiene en cuenta esta vez la hora del accidente, que previamente ha sido imputada de acuerdo con esta columna.
Debido al sesgo que existe en los valores, se realiza una imputación teniendo en cuenta la mediana.
df[['c10', 'c112', 'c114', 'c115']].nunique()
c10 24 c112 5 c114 12 c115 9 dtype: int64
# Convert the column to numeric codes so the median can be computed
# Encoder instance
oe = OrdinalEncoder()
oe.fit(df[[col]])
col_encoded = oe.transform(df[[col]])
# Overwrite the column with the encoded values
df[col] = col_encoded
# Build the groups by sky condition, restriction, visibility and hour
groups = df.groupby(['c112', 'c115', 'c114', 'c10'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.median(numeric_only=True)[[col]]
[+] TOTAL_GROUPS 4398
| c110 | ||||
|---|---|---|---|---|
| c112 | c115 | c114 | c10 | |
| BRKN | BLS | 1 | 09 | 0.0 |
| 14 | 0.0 | |||
| 18 | 0.5 | |||
| 10 | 09 | 0.0 | ||
| 14 | 0.0 | |||
| ... | ... | ... | ... | ... |
| NaN | DST | NaN | 12 | 0.0 |
| 14 | 0.0 | |||
| 15 | 0.0 | |||
| 18 | 0.5 | |||
| 20 | 1.0 |
4398 rows × 1 columns
# Apply the imputation (group median) to each group
df[[col]] = groups[[col]].transform(lambda group: group.fillna(group.median()))
# Map the numeric codes back to their original categories
df[[col]] = oe.inverse_transform(df[[col]])
# Plot the data
categorical_histogram(col, df, bins=5)
df[[col]].describe(include='all')
[+] VALUES_ENCODED ['1' '2' '3' '4' '5' nan] [+] SKEW 2.2781889265658224
| c110 | |
|---|---|
| count | 214216 |
| unique | 5 |
| top | 1 |
| freq | 161200 |
Tras la imputación, se ha bajado el sesgo en más de 1 punto.
Tras las imputaciones, se vuelve a comprobar el numero de valores faltantes dentro del grupo, ya que existe varias relaciones y por lo tanto, podría volver a aplicarse la imputación en caso de ser necesario.
# Show the remaining NaN count for each related column
related_cols = ['c110', 'c112', 'c113', 'c114', 'c115']
for col in related_cols:
    print(f'[+] NAN_VALUES\t{col}\t{col_name(col):<35}\t{df[[col]].isna().sum()[0]}')
[+] NAN_VALUES c110 Light condition code 448 [+] NAN_VALUES c112 Sky condition code 11 [+] NAN_VALUES c113 Cloud ceiling 3419 [+] NAN_VALUES c114 Visibility code 24985 [+] NAN_VALUES c115 Visibility restriction code. 146451
Si comparamos con los primeros valores, se ha reducido en gran medida el numero de datos faltantes, no obstante, las columnas c115 y c114, al haber sido las primeras en imputar, siguen teniendo muchos datos faltantes, por lo que, se decide volver a realizar la imputación, volviendo a seguir la lógica de mayor a menor número de datos faltantes.
# Inspect the visibility-restriction column (c115) before imputation
col = 'c115'
print('[+] COL\t', col_name(col), '\n')
print('[+] NAN_VALUES\t', df[[col]].isna().sum()[0])
df[[col]].describe()
[+] COL Visibility restriction code. [+] NAN_VALUES 146451
| c115 | |
|---|---|
| count | 68213 |
| unique | 9 |
| top | NONE |
| freq | 50036 |
categorical_histogram(col, df, bins=9)
[+] VALUES_ENCODED ['BLS' 'DRZ' 'DST' 'FOG' 'HAZ' 'NONE' 'RAI' 'SMK' 'SNO' nan] [+] SKEW -0.8350355000749172
Para la codificación de estos datos, del mismo modo que con las columnas anteriores, existe una relación con una serie de columnas, en concreto con:
df[['c112', 'c110', 'c114']].nunique()
c112 5 c110 5 c114 12 dtype: int64
Teniendo en cuenta los resultados, se realiza el agrupamiento por orden de c110/c112/c114
# Convert the column to numeric codes so a group statistic can be computed
# Encoder instance
oe = OrdinalEncoder()
oe.fit(df[[col]])
col_encoded = oe.transform(df[[col]])
# Overwrite the column with the encoded values
df[col] = col_encoded
# Build the groups ordered as: light code - sky condition - visibility distance
groups = df.groupby(['c110', 'c112', 'c114'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.mean(numeric_only=True)[[col]]
[+] TOTAL_GROUPS 287
| c115 | |||
|---|---|---|---|
| c110 | c112 | c114 | |
| 1 | BRKN | 1 | 4.390000 |
| 10 | 5.256881 | ||
| 2 | 4.389121 | ||
| 3 | 4.409266 | ||
| 4 | 4.344828 | ||
| ... | ... | ... | ... |
| NaN | SCAT | 4 | 2.500000 |
| G10 | 4.000000 | ||
| LS1 | NaN | ||
| NaN | 4.000000 | ||
| NaN | NaN | 2.000000 |
287 rows × 1 columns
def impute_restriction(value):
    """
    Impute the visibility-restriction code (c115) for one group.

    Applied per (c110, c112, c114) group on the ordinal-encoded column.
    NaNs are filled with the code for 'NONE' when the group's sky
    condition and visibility distance imply an unrestricted view:
    - clear-ish or unknown sky with visibility of 3 miles or more, or
    - overcast/obscured sky with visibility of 6 miles or more.
    Any other group is returned unchanged.

    Parameters
    ----------
    value : pd.Series
        One groupby group of the encoded c115 column; ``value.name``
        holds the (c110, c112, c114) group keys.
    """
    # Group keys are stringified below, so missing keys compare as 'nan'
    # (the original listed np.nan, which never matches a string)
    c112_min_restriction = ['CLER', 'BRKN', 'SCAT', 'nan']
    # 'OCST' per the actual category values (the original had the typo 'OSCT')
    c112_max_restriction = ['OCST', 'OBSC']
    c114_min_limit = [str(i) for i in range(3, 10)]
    c114_max_limit = [str(i) for i in range(6, 10)]
    c114_min_limit.append('G10')
    c114_max_limit.append('G10')
    # Ordinal code of 'NONE': the 9 categories are ordered alphabetically
    # (BLS DRZ DST FOG HAZ NONE RAI SMK SNO), so NONE encodes to 5.
    # The original used 9, which lies outside the valid 0-8 code range.
    value_input = 5
    # Stringified group keys (c110, c112, c114)
    group_values = [str(val) for val in value.name]
    # Visibility over 3 miles (clear-ish sky) or over 6 miles (covered sky)
    res_min = group_values[1] in c112_min_restriction and group_values[2] in c114_min_limit
    res_max = group_values[1] in c112_max_restriction and group_values[2] in c114_max_limit
    if res_min or res_max:
        return value.fillna(value_input)
    return value
# Apply the conditional imputation to each group
df[[col]] = groups[[col]].apply(impute_restriction)
# Map the numeric codes back to their original categories
df[[col]] = oe.inverse_transform(df[[col]])
# Present the data
categorical_histogram(col, df, bins=8)
df[[col]].describe()
[+] VALUES_ENCODED ['BLS' 'DRZ' 'DST' 'FOG' 'HAZ' 'NONE' 'RAI' 'SMK' 'SNO' nan] [+] SKEW -0.8350355000749172
| c115 | |
|---|---|
| count | 68213 |
| unique | 9 |
| top | NONE |
| freq | 50036 |
def impute_restriction_other(value):
    """
    Fill missing restriction codes with 'OTH' for low-visibility groups.

    Applied per (c110, c112, c114) group: when the visibility-distance
    key (third element of the group name) is '1', '2' or 'LS1' — i.e.
    at or below 2 miles — missing values are replaced with the 'OTH'
    category; any other group is returned unchanged.
    """
    # Visibility codes at or below 2 miles
    low_visibility = {'1', '2', 'LS1'}
    # Third group key is the c114 visibility-distance code
    visibility_key = str(value.name[2])
    if visibility_key not in low_visibility:
        return value
    return value.fillna('OTH')
# Apply the 'OTH' imputation to each group
df[[col]] = groups[[col]].apply(impute_restriction_other)
# Present the data
categorical_histogram(col, df, bins=8)
df[[col]].describe()
[+] VALUES_ENCODED ['BLS' 'DRZ' 'DST' 'FOG' 'HAZ' 'NONE' 'OTH' 'RAI' 'SMK' 'SNO' nan] [+] SKEW 0.30963066192953925
| c115 | |
|---|---|
| count | 68568 |
| unique | 10 |
| top | NONE |
| freq | 50036 |
# Rebuild the groups with fewer keys: sky condition - visibility distance
groups = df.groupby(['c112', 'c114'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups[[col]].count()
[+] TOTAL_GROUPS 66
| c115 | ||
|---|---|---|
| c112 | c114 | |
| BRKN | 1 | 147 |
| 10 | 145 | |
| 2 | 348 | |
| 3 | 360 | |
| 4 | 345 | |
| ... | ... | ... |
| SCAT | 9 | 121 |
| G10 | 11225 | |
| LS1 | 27 | |
| NaN | 148 | |
| NaN | NaN | 11 |
66 rows × 1 columns
def impute_restriction(value):
    """
    Impute the visibility-restriction code (c115) for one (c112, c114) group.

    This pass works on the string categories directly. Missing values are
    filled with:
    - 'OTH'  when visibility is at or below 2 miles ('1', '2', 'LS1');
    - 'NONE' when the sky/visibility combination implies no restriction
      (clear-ish or unknown sky with >= 3 miles, or overcast/obscured
      sky with >= 6 miles).
    Any other group is returned unchanged.
    """
    # Group keys are stringified below, so missing keys compare as 'nan'
    # (the original listed np.nan, which never matches a string)
    c112_min_rest = ['CLER', 'BRKN', 'SCAT', 'nan']
    # 'OCST' per the actual category values (the original had the typo 'OSCT')
    c112_max_rest = ['OCST', 'OBSC']
    c114_min_limit = [str(i) for i in range(3, 10)] + ['G10']
    c114_max_limit = [str(i) for i in range(6, 10)] + ['G10']
    c114_oth_limit = [str(i) for i in range(1, 3)] + ['LS1']
    value_oth = 'OTH'    # category for "other restriction"
    value_none = 'NONE'  # category for "no restriction"
    # Stringified group keys (c112, c114); the original computed this twice
    group_values = [str(val) for val in value.name]
    # Visibility over 3 miles (clear-ish sky) or over 6 miles (covered sky)
    res_min = group_values[0] in c112_min_rest and group_values[1] in c114_min_limit
    res_max = group_values[0] in c112_max_rest and group_values[1] in c114_max_limit
    # Visibility at or below 2 miles
    res_oth = group_values[1] in c114_oth_limit
    if res_oth:
        return value.fillna(value_oth)
    if res_min or res_max:
        return value.fillna(value_none)
    return value
# Apply the combined OTH/NONE imputation to each group
df[[col]] = groups[[col]].apply(impute_restriction)
# Present the data
categorical_histogram(col, df, bins=8)
df[[col]].describe()
[+] VALUES_ENCODED ['BLS' 'DRZ' 'DST' 'FOG' 'HAZ' 'NONE' 'OTH' 'RAI' 'SMK' 'SNO' nan] [+] SKEW 0.30148007599567717
| c115 | |
|---|---|
| count | 73821 |
| unique | 10 |
| top | NONE |
| freq | 55289 |
Tras esta nueva imputación, se ha conseguido imputar nuevos valores, pero siguen faltando muchos de ellos.
# Inspect the visibility-distance column (c114) before imputation
col = 'c114'
print('[+] COL\t', col_name(col), '\n')
print('[+] NAN_VALUES\t', df[[col]].isna().sum()[0])
df[[col]].describe()
[+] COL Visibility code [+] NAN_VALUES 24985
| c114 | |
|---|---|
| count | 189679 |
| unique | 12 |
| top | G10 |
| freq | 101138 |
categorical_histogram(col, df, bins=12)
[+] VALUES_ENCODED ['1' '10' '2' '3' '4' '5' '6' '7' '8' '9' 'G10' 'LS1' nan] [+] SKEW -0.9852153692695964
Para la codificación de estos datos, del mismo modo que con las columnas anteriores, existe una relación con una serie de columnas, en concreto con el estado del cielo (c112), el código de la luz (c110) y la restriccion de la visión (c115).
df[['c112', 'c110', 'c115', 'c10']].nunique()
c112 5 c110 5 c115 10 c10 24 dtype: int64
Teniendo en cuenta los resultados, se realiza el agrupamiento por orden de c110/c112/c115
# Convert the column to numeric codes so a group statistic can be computed
# Encoder instance
oe = OrdinalEncoder()
oe.fit(df[[col]])
col_encoded = oe.transform(df[[col]])
# Overwrite the column with the encoded values
df[col] = col_encoded
# Build the groups by light code, sky condition, restriction and hour
groups = df.groupby(['c110', 'c112', 'c115', 'c10'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.mean(numeric_only=True)[[col]]
[+] TOTAL_GROUPS 1905
| c114 | ||||
|---|---|---|---|---|
| c110 | c112 | c115 | c10 | |
| 1 | BRKN | BLS | 08 | 2.000000 |
| 09 | 0.333333 | |||
| 10 | 4.333333 | |||
| 11 | 5.500000 | |||
| 13 | 4.000000 | |||
| ... | ... | ... | ... | ... |
| NaN | SCAT | HAZ | 06 | 4.000000 |
| NaN | 6.000000 | |||
| OTH | 08 | 11.000000 | ||
| SMK | 22 | NaN | ||
| NaN | DST | 07 | NaN |
1905 rows × 1 columns
Por el sesgo cercano a la unidad, se decide realizar una imputación teniendo en cuenta la mediana.
# Apply the imputation (group median, chosen because of the skew)
df[[col]] = groups[[col]].transform(lambda group: group.fillna(group.median()))
# df[[col]] = groups[[col]].transform(lambda group: group.fillna(round(group.mean(), 0)))
# Map the numeric codes back to their original categories
df[[col]] = oe.inverse_transform(df[[col]])
# Present the data
categorical_histogram(col, df, bins=12)
df[[col]].describe()
[+] VALUES_ENCODED ['1' '10' '2' '3' '4' '5' '6' '7' '8' '9' 'G10' 'LS1' nan] [+] SKEW -0.9068290245984301
| c114 | |
|---|---|
| count | 193891 |
| unique | 12 |
| top | G10 |
| freq | 101139 |
Tras aplicar la reimputación, se ha conseguido imputar más valores.
# Inspect the cloud-ceiling column (c113) before imputation
col = 'c113'
print('[+] COL\t', col_name(col), '\n')
print('[+] NAN_VALUES\t', df[[col]].isna().sum()[0])
df[[col]].describe()
[+] COL Cloud ceiling [+] NAN_VALUES 3419
| c113 | |
|---|---|
| count | 211245.000000 |
| mean | 1246.399465 |
| std | 1750.345638 |
| min | 0.000000 |
| 25% | 38.000000 |
| 50% | 38.000000 |
| 75% | 2500.000000 |
| max | 9999.000000 |
numerical_histogram(col, df, bins=5)
[+] SKEW 1.5290207984601834
Teniendo en cuenta el significado de la columna, esta tiene una relacion con la columna c112, condiciones del cielo, pues sus datos representan el estado de las nubes en el momento del incidente. Además, también se tiene en cuenta las columnas de visibilidad, la c114 para la distancia de visión y c115 para el tipo de restricción. Dada esta relación, se agrupa el dataframe y se imputa los valores.
df[['c112', 'c114', 'c115']].nunique()
c112 5 c114 12 c115 10 dtype: int64
Teniendo en cuenta los resultados, se realiza el agrupamiento por orden de Condición del cierlo (c112)/Restricción(c115)/Distancia visión(c114).
# Build the groups ordered as: sky condition - restriction - visibility distance
groups = df.groupby(['c112', 'c115', 'c114'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.median(numeric_only=True)[[col]]
[+] TOTAL_GROUPS 443
| c113 | |||
|---|---|---|---|
| c112 | c115 | c114 | |
| BRKN | BLS | 1 | 1200.0 |
| 10 | 758.0 | ||
| 2 | 1200.0 | ||
| 3 | 2500.0 | ||
| 4 | 8000.0 | ||
| ... | ... | ... | ... |
| SCAT | SNO | G10 | NaN |
| LS1 | 1001.0 | ||
| NaN | 10 | 2500.0 | |
| NaN | 10.0 | ||
| NaN | DST | NaN | NaN |
443 rows × 1 columns
Para la imputación, se realiza teniendo en cuenta la mediana del atributo para cada uno de los códigos de la distancia de visión.
Dado el sesgo que existe, se opta por realizar una imputación teniendo en cuenta la mediana.
def impute_ceiling(value):
    """
    Fill missing cloud-ceiling values of one group with the group median.

    The median is preferred over the mean because of the strong skew
    observed in the c113 distribution.
    """
    return value.fillna(value.median())
# Apply the median imputation to each group
df[[col]] = groups[[col]].apply(impute_ceiling)
# Plot the data
numerical_histogram(col, df, bins=5)
df[[col]].describe()
[+] SKEW 1.5233162862038736
| c113 | |
|---|---|
| count | 212495.000000 |
| mean | 1250.626697 |
| std | 1746.859609 |
| min | 0.000000 |
| 25% | 38.000000 |
| 50% | 38.000000 |
| 75% | 2500.000000 |
| max | 9999.000000 |
Tras aplicar la reimputación, se ha conseguido imputar más valores.
# Re-inspect the light-condition column (c110) for a second imputation pass
col = 'c110'
print('[+] COL\t', col_name(col), '\n')
print('[+] NAN_VALUES\t', df[[col]].isna().sum()[0])
df[[col]].describe()
[+] COL Light condition code [+] NAN_VALUES 448
| c110 | |
|---|---|
| count | 214216 |
| unique | 5 |
| top | 1 |
| freq | 161200 |
categorical_histogram(col, df, bins=5)
[+] VALUES_ENCODED ['1' '2' '3' '4' '5' nan] [+] SKEW 2.2781889265658224
Para la imputación, se agrupa el dataframe por las horas del accidente; se tiene en cuenta esta vez la hora del accidente, que previamente ha sido imputada de acuerdo con esta columna.
Debido al sesgo que existe en los valores, se realiza una imputación teniendo en cuenta la mediana.
df[['c10', 'c112', 'c114', 'c115']].nunique()
c10 24 c112 5 c114 12 c115 10 dtype: int64
# Convert the column to numeric codes so the median can be computed
# Encoder instance
oe = OrdinalEncoder()
oe.fit(df[[col]])
col_encoded = oe.transform(df[[col]])
# Overwrite the column with the encoded values
df[col] = col_encoded
# Build the groups by sky condition, restriction, visibility and hour
groups = df.groupby(['c112', 'c115', 'c114', 'c10'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.median(numeric_only=True)[[col]]
[+] TOTAL_GROUPS 4358
| c110 | ||||
|---|---|---|---|---|
| c112 | c115 | c114 | c10 | |
| BRKN | BLS | 1 | 09 | 0.0 |
| 14 | 0.0 | |||
| 18 | 0.5 | |||
| 10 | 09 | 0.0 | ||
| 14 | 0.0 | |||
| ... | ... | ... | ... | ... |
| NaN | DST | NaN | 12 | 0.0 |
| 14 | 0.0 | |||
| 15 | 0.0 | |||
| 18 | 0.5 | |||
| 20 | 1.0 |
4358 rows × 1 columns
# Apply the imputation (group median) to each group
df[[col]] = groups[[col]].transform(lambda group: group.fillna(group.median()))
# Map the numeric codes back to their original categories
df[[col]] = oe.inverse_transform(df[[col]])
# Plot the data
categorical_histogram(col, df, bins=5)
df[[col]].describe(include='all')
[+] VALUES_ENCODED ['1' '2' '3' '4' '5' nan] [+] SKEW 2.2781889265658224
| c110 | |
|---|---|
| count | 214216 |
| unique | 5 |
| top | 1 |
| freq | 161200 |
Tras la imputación, se ha bajado el sesgo en más de 1 punto.
# Re-inspect the sky-condition column (c112) for a second imputation pass
col = 'c112'
print('[+] COL\t', col_name(col), '\n')
print('[+] NAN_VALUES\t', df[[col]].isna().sum()[0])
df[[col]].describe()
[+] COL Sky condition code [+] NAN_VALUES 11
| c112 | |
|---|---|
| count | 214653 |
| unique | 5 |
| top | OCST |
| freq | 94337 |
categorical_histogram(col, df, bins=5)
[+] VALUES_ENCODED ['BRKN' 'CLER' 'OBSC' 'OCST' 'SCAT' nan] [+] SKEW -0.1135850270264526
Teniendo en cuenta el significado de la columna, esta tiene una relación con las columnas de visibilidad, la c114 para la distancia de visión y c115 para el tipo de restricción. Dada esta relación, se agrupa el dataframe y se imputan los valores.
df[['c114', 'c115']].nunique()
c114 12 c115 10 dtype: int64
Teniendo en cuenta los resultados, se realiza el agrupamiento por orden de Restricción(c115)/Distancia visión(c114).
# Convert the column to numeric codes so the median can be computed
# Encoder instance
oe = OrdinalEncoder()
oe.fit(df[[col]])
col_encoded = oe.transform(df[[col]])
# Overwrite the column with the encoded values
df[col] = col_encoded
# Build the groups ordered as: visibility restriction - visibility distance
groups = df.groupby(['c115', 'c114'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.median(numeric_only=True)[[col]]
[+] TOTAL_GROUPS 122
| c112 | ||
|---|---|---|
| c115 | c114 | |
| BLS | 1 | 2.0 |
| 10 | 3.0 | |
| 2 | 3.0 | |
| 3 | 3.0 | |
| 4 | 2.0 | |
| ... | ... | ... |
| NaN | 7 | 3.0 |
| 8 | 3.0 | |
| 9 | 3.0 | |
| G10 | 3.0 | |
| NaN | 1.0 |
122 rows × 1 columns
# Apply the imputation: fill NaNs with each group's median
df[[col]] = groups[[col]].transform(lambda group: group.fillna(group.median()))
# Map the numeric codes back to their original categories
df[[col]] = oe.inverse_transform(df[[col]])
categorical_histogram(col, df, bins=5)
df[[col]].describe()
[+] VALUES_ENCODED ['BRKN' 'CLER' 'OBSC' 'OCST' 'SCAT' nan] [+] SKEW -0.1135850270264526
| c112 | |
|---|---|
| count | 214653 |
| unique | 5 |
| top | OCST |
| freq | 94337 |
Tras la imputación se observa que el sesgo que existía, se ha reducido en 0.2 puntos.
# Show the remaining NaN count for each related column
related_cols = ['c110', 'c112', 'c113', 'c114', 'c115']
for col in related_cols:
    print(f'[+] NAN_VALUES\t{col}\t{col_name(col):<35}\t{df[[col]].isna().sum()[0]}')
[+] NAN_VALUES c110 Light condition code 448 [+] NAN_VALUES c112 Sky condition code 11 [+] NAN_VALUES c113 Cloud ceiling 2169 [+] NAN_VALUES c114 Visibility code 20773 [+] NAN_VALUES c115 Visibility restriction code. 140843
# Inspect the accident local-time column (c10) before imputation
col = 'c10'
print('[+] COL\t', col_name(col), '\n')
print('[+] NAN_VALUES\t', df[[col]].isna().sum()[0])
df[[col]].describe(include='all')
[+] COL Local time of the accident/incident. [+] NAN_VALUES 25668
| c10 | |
|---|---|
| count | 188996 |
| unique | 24 |
| top | 15 |
| freq | 16812 |
# Plot the hourly distribution before imputation
categorical_histogram(col, df)
df[[col]].describe()
[+] VALUES_ENCODED ['00' '01' '02' '03' '04' '05' '06' '07' '08' '09' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '20' '21' '22' '23' nan] [+] SKEW -0.32542124023620517
| c10 | |
|---|---|
| count | 188996 |
| unique | 24 |
| top | 15 |
| freq | 16812 |
Para la imputación, se tiene en cuenta la condición luminica del cielo, c110, ya que sus datos indican en que fase del dia se produce el incidente. Para dicho proceso, se agrupa los datos según la c110.
Como método de imputación, se decide emplear la mediana dada la distribución de los datos; por lo tanto, previamente se debe convertir la columna a numérica.
# Convert the hour strings to numeric so the median can be computed
df[col] = df[col].astype(float)
# Build the groups by light-condition code (phase of the day)
groups = df.groupby(['c110'], group_keys=False, dropna=False)
print('[+] TOTAL_GROUPS\t', groups.ngroups)
groups.median(numeric_only=True)[[col]]
[+] TOTAL_GROUPS 6
| c10 | |
|---|---|
| c110 | |
| 1 | 13.0 |
| 2 | 19.0 |
| 3 | 7.0 |
| 4 | 18.0 |
| 5 | 15.0 |
| NaN | 0.0 |
# Apply the imputation (group median) to each group
df[[col]] = groups[[col]].transform(lambda group: group.fillna(group.median()))
# Convert the column back to categorical, keeping hours in HH format.
# Series.str.zfill replaces the original DataFrame.applymap round-trip:
# applymap is deprecated since pandas 2.1 (renamed DataFrame.map), and the
# vectorized .str accessor avoids a Python-level lambda per cell.
df[col] = df[col].astype(int).astype(str).str.zfill(2)
# Plot the data
categorical_histogram(col, df)
df[[col]].describe(include='all')
[+] VALUES_ENCODED ['00' '01' '02' '03' '04' '05' '06' '07' '08' '09' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '20' '21' '22' '23'] [+] SKEW -0.28848761729245215
| c10 | |
|---|---|
| count | 214664 |
| unique | 24 |
| top | 13 |
| freq | 37219 |
Tras la imputación de los datos, se observa que el sesgo a la izquierda ha crecido 0.02 puntos y que el registro con mayor incidencia es el de la hora 13.
plot_nan(df)
# Drop the unwanted columns
df = df.drop(columns=['c243', 'c240', 'c30', 'c31', 'c11', 'c12', 'c13', 'c14'])
plot_nan(df)
Para tratar con los datos NaN restantes, se eliminan los registros que tengan datos nulos.
# Optional row-wise NaN drop, currently disabled
#df.dropna(inplace=True)
#df.reset_index(inplace=True, drop=True)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 214664 entries, 0 to 214663 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 c1 214632 non-null object 1 c7 214664 non-null float64 2 c10 214664 non-null object 3 c20 184376 non-null float64 4 c21 184365 non-null float64 5 c106 214664 non-null object 6 c108 214664 non-null object 7 c110 214216 non-null object 8 c112 214653 non-null object 9 c113 212495 non-null float64 10 c114 193891 non-null object 11 c115 73821 non-null object 12 c241 167686 non-null float64 13 c41 214664 non-null object 14 c49 214664 non-null object 15 c56 214664 non-null float64 16 c96 214664 non-null object dtypes: float64(6), object(11) memory usage: 27.8+ MB
En este punto, se asegura de que los atributos que han quedado en el dataset contienen el tipo de datos esperado.
# Values present in the month column (c7)
df['c7'].unique()
array([ 1., 2., 3., 4., 5., 6., 7., 10., 9., 8., 11., 12.])
# Convert the values to integer and then to string type
df['c7'] = df['c7'].astype(int).astype(str)
df['c7'].unique()
array(['1', '2', '3', '4', '5', '6', '7', '10', '9', '8', '11', '12'],
dtype=object)
# Classes of the phase-of-flight attribute (c96)
df['c96'].unique()
array(['IB', 'DD', 'HB', 'IA', 'BD', 'BI', 'IH', 'DE', 'DA', 'BC', 'KE',
'IE', 'BG', 'BF', 'KC', 'HA', 'GX', 'IC', 'EX', 'FE', 'BE', 'JX',
'AD', 'KG', 'KD', 'BA', 'BH', 'AB', 'AA', 'HD', 'KB', 'KF', 'IF',
'AC', 'HC', 'KA', 'ID', 'FD', 'IG', 'FF', 'DF', 'FG', 'BB', 'HH',
'HF', 'DB', 'FA', 'FB', 'DG', 'HG', 'FC', 'HE', 'DH', 'PJ', 'HM',
'SR', 'HK', 'PL'], dtype=object)
# Phase-of-flight code groups used to reduce c96 to six broad categories
# On the ground
GND = ['AA', 'AB', 'AC', 'AD', 'FA', 'FB', 'FC', 'FD', 'FE', 'FF',
       'FG', 'HB', 'HC', 'HD', 'HE', 'HF', 'HG', 'HH', 'HM']
# Landing
LDG = ['BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'EX',
       'HA', 'HK']
# Taking off
T_O = ['CX', 'IA', 'IB', 'IC', 'ID', 'IE', 'IF', 'IG', 'IH']
# Airborne
AIRBNE = ['DA', 'DB', 'DC', 'DD', 'DE', 'DF', 'DG', 'DH', 'GX']
# Others
OTH = ['JX', 'PJ', 'PL']
# Special flights
SPL = ['KA', 'KB', 'KC', 'KD', 'KE', 'KF', 'KG', 'SR']
# Reverse lookup built once: the original rebuilt the phase map and scanned
# every list on each apply() call (~215k rows); this makes each call O(1).
_PHASE_BY_CODE = {code: phase
                  for phase, codes in (('GND', GND), ('LDG', LDG),
                                       ('T/O', T_O), ('AIRBNE', AIRBNE),
                                       ('SPL', SPL))
                  for code in codes}
# Apply the feature reduction
def new_phase(value: str) -> str:
    """
    Map a detailed phase-of-flight code to its reduced category.

    Returns one of 'GND', 'LDG', 'T/O', 'AIRBNE', 'SPL', or 'OTH' for
    any code not covered by the known groups (including the explicit
    OTH codes, exactly as in the original fall-through behaviour).
    """
    return _PHASE_BY_CODE.get(value, 'OTH')
# Reduce the phase-of-flight codes to their broad categories
df['c96'] = df['c96'].apply(new_phase)
df['c96'].unique()
array(['T/O', 'AIRBNE', 'GND', 'LDG', 'SPL', 'OTH'], dtype=object)
A continuación, se realiza una serie de modificaciones en los datos para que estos puedan ser interpretados por el modelo.
# Split categorical from numerical columns, excluding the target variable
cat_cols = []
num_cols = []
target = 'c1'
for col in df:
    if col == target:
        continue
    if df[col].dtype == 'object':
        cat_cols.append(col)
    elif df[col].dtype in ['float64']:
        num_cols.append(col)
print(f'[+] CAT_COLS\t{cat_cols}')
print(f'[+] NUM_COLS\t{num_cols}')
[+] CAT_COLS ['c7', 'c10', 'c106', 'c108', 'c110', 'c112', 'c114', 'c115', 'c41', 'c49', 'c96'] [+] NUM_COLS ['c20', 'c21', 'c113', 'c241', 'c56']
Existe una serie de columnas con datos categóricos que se deberá de codificar para poder tratar sus datos y realizar nuestra prediccion del modelo.
Hasta ahora se ha trabajado con las columnas categóricas y numéricas en base a sus datos, pero la codificación necesaria se va a realizar sobre todas las columnas que serán categóricas para construir el modelo.
Existe tres técnicas de Data Encoding dependiendo de las carácteristicas de los atributos:
A continuación se muestra los tipos de datos categoricos del dataset para seleccionar que tecnica emplear con cada atributo.
# Columns encoded with LabelEncoder (the target)
col_le = [target]
# Columns encoded with OrdinalEncoder
col_oe = ['c108', 'c49', 'c41', 'c114']
# Columns encoded with OneHotEncoder
col_ohe = ['c106', 'c96', 'c115', 'c112']
# Columns whose values are already numeric strings: just retype them
col_retype = ['c110', 'c10', 'c7']
col_oe.extend(col_retype)
Para aquellos atributos que sus datos categoricos, son todos numéricos, simplemente se les aplica un cambio de tipo de datos a entero para poder ser interpretado por el modelo, col_retype. Para la variable dependiente u objetivo, c1, así como para los atributos de entrada cuyos datos tienen un orden de relación, se emplea LabelEncoder, col_le. Para los atributos cuyas clases no están relacionadas, se emplea OrdinalEncoder y OneHotEncoder como ya se ha mencionado. Los atributos que poseen pocas clases y además tienen una mayor importancia en el significado del estudio, se ha decidido emplear OneHotEncoder col_ohe. Para el resto de columnas y sin un fuerte significado en el estudio, se ha realizado un OrdinalEncoder, col_oe.
Para la transformación de los datos, se guarda el diccionario de los datos que se va a mapear para poder emplearlo en la entrada de nuevos datos.
# Encoder instance
le = LabelEncoder()
mapped_dic = {}
for col in col_le:
    # Fit and transform the column in place
    df[col] = le.fit_transform(df[col])
    # Record the code -> category mapping for later decoding
    mapped_dic.update({col: dict(zip(le.transform(le.classes_), le.classes_))})
mapped_le = pd.DataFrame.from_dict(mapped_dic, orient='index').fillna('-')
mapped_le
| 0 | 1 | 2 | |
|---|---|---|---|
| c1 | A | I | - |
# Encoder instance
oe = OrdinalEncoder()
cat_cols_mapped = []
for col in col_oe:
    df[col] = oe.fit_transform(df[[col]])
    categories = list(oe.categories_[0])
    # Record the code -> category mapping for later decoding
    mapped_dic.update({col: {idx: cat for idx, cat in enumerate(categories)}})
    # Track which columns have been transformed
    cat_cols_mapped.append(col)
mapped_df = pd.DataFrame.from_dict(mapped_dic, orient='index').fillna('-')
mapped_df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| c1 | A | I | - | - | - | - | - | - | - | - | ... | - | - | - | - | - | - | - | - | - | - |
| c108 | A | B | C | D | E | F | G | H | I | J | ... | O | P | Q | R | S | T | U | V | W | - |
| c49 | E | F | G | H | M | N | P | R | S | T | ... | - | - | - | - | - | - | - | - | - | - |
| c41 | 00 | 01 | 02 | 03 | 09 | F1 | F3 | F9 | TT | XX | ... | - | - | - | - | - | - | - | - | - | - |
| c114 | 1 | 10 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | - | - | - | - | - | - | - | - | - | - |
| c110 | 1 | 2 | 3 | 4 | 5 | - | - | - | - | - | ... | - | - | - | - | - | - | - | - | - | - |
| c10 | 00 | 01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | 09 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
| c7 | 1 | 10 | 11 | 12 | 2 | 3 | 4 | 5 | 6 | 7 | ... | - | - | - | - | - | - | - | - | - | - |
8 rows × 24 columns
# Persist the mapping dictionary as .csv for later use at inference time
mapped_df.to_csv(f'{model_info_dir}mapped_dictionary.csv', index=True)
# Example of recovering a mapped value, currently disabled
#map_df = pd.read_csv('Forecast/mapped_dictionary.csv', index_col=0)
#value_mapped = map_df.loc['c114'].eq('2')
#value_mapped = value_mapped.idxmax() if value_mapped.any() else None
#try:
#    print(int(value_mapped))
#except:
#    print('Valor no encontrado')
# Encoder instance
ohe = OneHotEncoder()
# Compute the encoded values as a dense array
vals = ohe.fit_transform(df[col_ohe]).toarray()
feature_names = ohe.get_feature_names_out(col_ohe)
# Track the generated one-hot columns as transformed categoricals
cat_cols_mapped.extend(feature_names)
# Build a DataFrame with the one-hot encoded features
encoded_data = pd.DataFrame(vals, columns=feature_names)
# Concatenate the original DataFrame (minus the source columns) with the encoded one
df = pd.concat([df.drop(col_ohe, axis=1), encoded_data], axis=1)
df.head()
| c1 | c7 | c10 | c20 | c21 | c108 | c110 | c113 | c114 | c241 | ... | c115_RAI | c115_SMK | c115_SNO | c115_nan | c112_BRKN | c112_CLER | c112_OBSC | c112_OCST | c112_SCAT | c112_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.0 | 18.0 | 39.200000 | -93.916667 | 18.0 | 3.0 | 38.0 | 1.0 | 8.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.0 | 13.0 | 38.716667 | -88.183333 | 18.0 | 0.0 | 38.0 | 7.0 | 8.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 2 | 0 | 0.0 | 7.0 | NaN | NaN | 18.0 | 2.0 | 38.0 | 1.0 | 8.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0 | 0.0 | 13.0 | 41.683333 | -86.816667 | 18.0 | 0.0 | 38.0 | 7.0 | 8.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 4 | 0 | 0.0 | 13.0 | NaN | NaN | 18.0 | 0.0 | 38.0 | 7.0 | 8.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
5 rows × 39 columns
# Integer retype of col_retype, currently disabled (handled by OrdinalEncoder)
#for col in col_retype:
#    df[col] = df[col].astype(int)
#    # Añadir columnas a lista de columnas transformadas
#    cat_cols_mapped.append(col)
df[num_cols].describe()
| c20 | c21 | c113 | c241 | c56 | |
|---|---|---|---|---|---|
| count | 184376.000000 | 184365.000000 | 212495.000000 | 167686.000000 | 214664.000000 |
| mean | 38.606490 | -97.558997 | 1250.626697 | 8.352826 | 3046.764404 |
| std | 8.025867 | 22.163640 | 1746.859609 | 3.944927 | 4925.593126 |
| min | -163.866667 | -180.750000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 33.735714 | -112.066667 | 38.000000 | 8.000000 | 406.000000 |
| 50% | 38.682203 | -93.300000 | 38.000000 | 8.000000 | 1200.000000 |
| 75% | 42.016667 | -82.050000 | 2500.000000 | 8.000000 | 3195.000000 |
| max | 90.916667 | 180.000000 | 9999.000000 | 99.000000 | 92591.000000 |
En base a la desviación estandar de determinadas columnas, que son elevadas, se realiza el normalizado de los datos y el escalado entre 0 y 1.
# Columns assigned to each scaling method
std_cols = ['c21', 'c113', 'c56']
min_max_cols = ['c20', 'c241']
# Scaler instances
std_scaler = StandardScaler(with_mean=False)
min_max_scaler = MinMaxScaler()
# Fit the scalers and persist them for reuse at inference time
std_scaler.fit(df[std_cols])
min_max_scaler.fit(df[min_max_cols])
joblib.dump(std_scaler, f'{model_info_dir}scaler_{"_".join(std_cols)}.joblib')
joblib.dump(min_max_scaler, f'{model_info_dir}scaler_{"_".join(min_max_cols)}.joblib')
# Example of reloading a persisted scaler, currently disabled
#scaler_new = joblib.load('scaler.joblib')
#df['c31'] = scaler_new.transform(df[['c31']])
# Transform the data with the fitted scalers
df[std_cols] = std_scaler.transform(df[std_cols])
df[min_max_cols] = min_max_scaler.transform(df[min_max_cols])
df.describe()
| c1 | c7 | c10 | c20 | c21 | c108 | c110 | c113 | c114 | c241 | ... | c115_RAI | c115_SMK | c115_SNO | c115_nan | c112_BRKN | c112_CLER | c112_OBSC | c112_OCST | c112_SCAT | c112_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 214664.000000 | 214664.000000 | 214664.000000 | 184376.000000 | 184365.000000 | 214664.000000 | 214216.000000 | 212495.000000 | 193891.000000 | 167686.000000 | ... | 214664.000000 | 214664.000000 | 214664.000000 | 214664.000000 | 214664.000000 | 214664.000000 | 214664.000000 | 214664.000000 | 214664.000000 | 214664.000000 |
| mean | 0.498076 | 6.037370 | 13.975138 | 0.794688 | -4.401771 | 16.787091 | 0.529456 | 0.715930 | 7.278069 | 0.084372 | ... | 0.016584 | 0.000587 | 0.007444 | 0.656109 | 0.065516 | 0.386954 | 0.013342 | 0.439464 | 0.094674 | 0.000051 |
| std | 0.500296 | 3.418477 | 4.038049 | 0.031501 | 1.000003 | 3.867490 | 1.144566 | 1.000002 | 3.542568 | 0.039848 | ... | 0.127707 | 0.024220 | 0.085958 | 0.475006 | 0.247435 | 0.487054 | 0.114734 | 0.496323 | 0.292764 | 0.007158 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -8.155271 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 3.000000 | 11.000000 | 0.775570 | -5.056343 | 18.000000 | 0.000000 | 0.021753 | 5.000000 | 0.080808 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 6.000000 | 14.000000 | 0.794985 | -4.209609 | 18.000000 | 0.000000 | 0.021753 | 10.000000 | 0.080808 | ... | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 1.000000 | 9.000000 | 17.000000 | 0.808072 | -3.702019 | 18.000000 | 0.000000 | 1.431143 | 10.000000 | 0.080808 | ... | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
| max | 2.000000 | 11.000000 | 23.000000 | 1.000000 | 8.121432 | 22.000000 | 4.000000 | 5.724000 | 11.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 39 columns
# Visualise missing values per column (plot_nan is a helper defined earlier in the notebook)
plot_nan(df)
# Column dtypes and non-null counts after scaling
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 214664 entries, 0 to 214663 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 c1 214664 non-null int32 1 c7 214664 non-null float64 2 c10 214664 non-null float64 3 c20 184376 non-null float64 4 c21 184365 non-null float64 5 c108 214664 non-null float64 6 c110 214216 non-null float64 7 c113 212495 non-null float64 8 c114 193891 non-null float64 9 c241 167686 non-null float64 10 c41 214664 non-null float64 11 c49 214664 non-null float64 12 c56 214664 non-null float64 13 c106_A 214664 non-null float64 14 c106_B 214664 non-null float64 15 c106_C 214664 non-null float64 16 c96_AIRBNE 214664 non-null float64 17 c96_GND 214664 non-null float64 18 c96_LDG 214664 non-null float64 19 c96_OTH 214664 non-null float64 20 c96_SPL 214664 non-null float64 21 c96_T/O 214664 non-null float64 22 c115_BLS 214664 non-null float64 23 c115_DRZ 214664 non-null float64 24 c115_DST 214664 non-null float64 25 c115_FOG 214664 non-null float64 26 c115_HAZ 214664 non-null float64 27 c115_NONE 214664 non-null float64 28 c115_OTH 214664 non-null float64 29 c115_RAI 214664 non-null float64 30 c115_SMK 214664 non-null float64 31 c115_SNO 214664 non-null float64 32 c115_nan 214664 non-null float64 33 c112_BRKN 214664 non-null float64 34 c112_CLER 214664 non-null float64 35 c112_OBSC 214664 non-null float64 36 c112_OCST 214664 non-null float64 37 c112_SCAT 214664 non-null float64 38 c112_nan 214664 non-null float64 dtypes: float64(38), int32(1) memory usage: 63.1 MB
Se eliminan las filas con valores NaN para poder entrenar los modelos y realizar predicciones.
# Drop every row containing a NaN and renumber the index from 0, so the
# correlation matrix (and later the models) works on complete cases only.
df = df.dropna().reset_index(drop=True)
# Pairwise Pearson correlation of all (now numeric) columns,
# rendered with a blue colour gradient.
corr = df.corr()
corr.style.background_gradient(cmap='Blues')
| c1 | c7 | c10 | c20 | c21 | c108 | c110 | c113 | c114 | c241 | c41 | c49 | c56 | c106_A | c106_B | c106_C | c96_AIRBNE | c96_GND | c96_LDG | c96_OTH | c96_SPL | c96_T/O | c115_BLS | c115_DRZ | c115_DST | c115_FOG | c115_HAZ | c115_NONE | c115_OTH | c115_RAI | c115_SMK | c115_SNO | c115_nan | c112_BRKN | c112_CLER | c112_OBSC | c112_OCST | c112_SCAT | c112_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| c1 | 1.000000 | -0.070863 | 0.016290 | -0.048699 | 0.099790 | 0.140198 | -0.025267 | -0.126082 | 0.194456 | 0.021056 | -0.020941 | 0.254614 | 0.052979 | -0.055329 | -0.281742 | 0.357267 | -0.018543 | 0.067325 | 0.022960 | -0.026256 | -0.052605 | -0.084736 | -0.000654 | -0.002262 | 0.000979 | -0.036486 | 0.003433 | -0.110984 | -0.008437 | -0.001396 | -0.006058 | 0.006183 | 0.112941 | -0.048806 | -0.142594 | -0.024711 | 0.201998 | -0.054336 | 0.001979 |
| c7 | -0.070863 | 1.000000 | -0.004269 | 0.080885 | -0.039872 | 0.045498 | -0.026713 | 0.017097 | -0.007894 | -0.048215 | -0.017021 | -0.014936 | -0.008232 | -0.098020 | 0.080878 | -0.028357 | 0.012648 | -0.016672 | -0.012364 | -0.000420 | -0.008695 | 0.027539 | -0.041961 | -0.021342 | -0.000885 | -0.057095 | 0.040071 | 0.021340 | -0.015872 | -0.005112 | 0.001704 | -0.077582 | 0.006110 | 0.001275 | 0.022845 | -0.051857 | -0.036189 | 0.042584 | 0.000302 |
| c10 | 0.016290 | -0.004269 | 1.000000 | 0.031158 | 0.016011 | 0.006441 | 0.135588 | 0.041993 | -0.034703 | 0.010798 | -0.050868 | -0.000040 | -0.033158 | 0.005537 | -0.016646 | 0.015371 | 0.005388 | -0.032649 | 0.044935 | 0.009513 | 0.003028 | -0.018062 | -0.001542 | -0.001143 | 0.004531 | -0.028252 | 0.005945 | 0.073226 | 0.001557 | 0.019598 | 0.003055 | 0.004627 | -0.067042 | 0.018653 | 0.049656 | -0.010968 | -0.070744 | 0.025965 | -0.004468 |
| c20 | -0.048699 | 0.080885 | 0.031158 | 1.000000 | -0.364804 | -0.040066 | 0.008684 | 0.028940 | 0.040391 | 0.007521 | -0.027922 | -0.008221 | -0.007940 | 0.033159 | -0.009276 | -0.010125 | -0.016079 | 0.006911 | -0.023249 | -0.007169 | -0.003216 | 0.040666 | 0.038172 | 0.007435 | -0.004762 | 0.013598 | -0.038521 | 0.023635 | 0.005006 | 0.013410 | 0.001143 | 0.056722 | -0.030380 | 0.010352 | 0.000075 | 0.027711 | 0.006814 | -0.031226 | -0.001907 |
| c21 | 0.099790 | -0.039872 | 0.016011 | -0.364804 | 1.000000 | 0.024413 | -0.000994 | -0.041153 | -0.062546 | 0.005238 | 0.012967 | -0.010630 | -0.024143 | 0.023703 | -0.062560 | 0.055573 | -0.003578 | -0.010877 | 0.031276 | -0.011944 | -0.011877 | -0.013084 | -0.012031 | 0.004599 | -0.007653 | 0.020004 | 0.047597 | -0.085339 | 0.004190 | 0.005934 | -0.012917 | -0.005393 | 0.055084 | 0.011393 | -0.068721 | -0.004440 | 0.052207 | 0.017602 | -0.002234 |
| c108 | 0.140198 | 0.045498 | 0.006441 | -0.040066 | 0.024413 | 1.000000 | 0.065681 | -0.044305 | 0.049633 | -0.207331 | 0.001697 | 0.148150 | 0.015741 | -0.348452 | 0.081292 | 0.134315 | -0.007594 | 0.089698 | -0.067026 | -0.046189 | -0.041541 | -0.014793 | -0.049874 | -0.060599 | -0.034301 | -0.270817 | 0.009111 | -0.008747 | -0.037571 | -0.160739 | -0.000406 | -0.089721 | 0.164796 | -0.054934 | 0.103996 | -0.166864 | -0.032076 | -0.006624 | -0.007683 |
| c110 | -0.025267 | -0.026713 | 0.135588 | 0.008684 | -0.000994 | 0.065681 | 1.000000 | -0.187137 | -0.066235 | -0.032285 | -0.021138 | 0.085203 | 0.007212 | -0.010706 | 0.067272 | -0.069709 | -0.113638 | 0.257706 | -0.107214 | -0.005139 | -0.024267 | -0.104452 | -0.003157 | -0.002293 | -0.003911 | -0.006372 | -0.045123 | -0.137889 | -0.004065 | -0.015675 | -0.005604 | -0.004625 | 0.152213 | -0.070819 | 0.356284 | -0.002699 | -0.257334 | -0.095082 | -0.000773 |
| c113 | -0.126082 | 0.017097 | 0.041993 | 0.028940 | -0.041153 | -0.044305 | -0.187137 | 1.000000 | 0.032977 | 0.064955 | 0.030555 | 0.159608 | -0.006176 | -0.057656 | 0.251374 | -0.248788 | 0.074236 | -0.189161 | 0.083391 | -0.007418 | 0.010416 | 0.087629 | -0.001319 | -0.015277 | 0.020077 | -0.028721 | 0.138377 | 0.559807 | -0.015254 | 0.079452 | 0.009378 | 0.010756 | -0.578710 | 0.382594 | -0.013487 | -0.036314 | -0.398548 | 0.389328 | nan |
| c114 | 0.194456 | -0.007894 | -0.034703 | 0.040391 | -0.062546 | 0.049633 | -0.066235 | 0.032977 | 1.000000 | 0.009576 | 0.014672 | 0.046504 | -0.001602 | -0.110792 | -0.134594 | 0.225436 | 0.057711 | -0.138870 | 0.066288 | 0.004949 | -0.001330 | 0.041810 | -0.007822 | -0.030765 | -0.012576 | -0.090605 | -0.101654 | 0.335947 | -0.039154 | -0.086953 | -0.012102 | -0.056555 | -0.201752 | -0.038739 | -0.357044 | -0.005656 | 0.355387 | -0.000071 | nan |
| c241 | 0.021056 | -0.048215 | 0.010798 | 0.007521 | 0.005238 | -0.207331 | -0.032285 | 0.064955 | 0.009576 | 1.000000 | 0.001521 | 0.029386 | -0.000694 | 0.028817 | 0.012161 | -0.032795 | -0.012661 | 0.015781 | 0.015585 | 0.001066 | -0.005602 | -0.023538 | 0.067990 | 0.000547 | 0.070985 | -0.026327 | -0.024646 | 0.028133 | 0.009259 | 0.061445 | -0.004302 | 0.046560 | -0.044754 | 0.052019 | -0.046498 | 0.003817 | -0.014332 | 0.042487 | 0.010079 |
| c41 | -0.020941 | -0.017021 | -0.050868 | -0.027922 | 0.012967 | 0.001697 | -0.021138 | 0.030555 | 0.014672 | 0.001521 | 1.000000 | 0.040658 | 0.160796 | -0.015636 | 0.041679 | -0.037333 | -0.021927 | 0.007990 | 0.017169 | -0.028661 | 0.031786 | -0.006571 | 0.007529 | 0.004928 | -0.003213 | -0.008738 | 0.013974 | 0.010925 | 0.001741 | -0.003569 | 0.004325 | 0.003540 | -0.013598 | 0.005435 | -0.014371 | 0.000568 | -0.000137 | 0.019359 | -0.001450 |
| c49 | 0.254614 | -0.014936 | -0.000040 | -0.008221 | -0.010630 | 0.148150 | 0.085203 | 0.159608 | 0.046504 | 0.029386 | 0.040658 | 1.000000 | 0.049722 | -0.044731 | -0.041731 | 0.076719 | -0.062828 | 0.146347 | -0.041491 | -0.048335 | -0.025451 | -0.068620 | 0.013023 | 0.009641 | 0.003917 | 0.029196 | 0.042994 | 0.074984 | 0.010348 | 0.027983 | 0.004771 | 0.022136 | -0.109284 | 0.056541 | 0.147097 | 0.024677 | -0.221646 | 0.073517 | 0.002677 |
| c56 | 0.052979 | -0.008232 | -0.033158 | -0.007940 | -0.024143 | 0.015741 | 0.007212 | -0.006176 | -0.001602 | -0.000694 | 0.160796 | 0.049722 | 1.000000 | 0.037091 | -0.029070 | 0.008981 | 0.024362 | 0.015360 | -0.046559 | -0.021966 | -0.001772 | 0.013976 | 0.016238 | 0.007250 | 0.001905 | 0.019944 | -0.004340 | -0.022469 | 0.007402 | 0.025854 | 0.003630 | 0.026383 | 0.001163 | 0.007857 | -0.015960 | 0.020120 | 0.012111 | -0.008547 | 0.001720 |
| c106_A | -0.055329 | -0.098020 | 0.005537 | 0.033159 | 0.023703 | -0.348452 | -0.010706 | -0.057656 | -0.110792 | 0.028817 | -0.015636 | -0.044731 | 0.037091 | 1.000000 | -0.487577 | -0.095548 | 0.069700 | -0.107007 | 0.045055 | 0.057245 | 0.086382 | -0.020689 | 0.131674 | 0.109278 | 0.014058 | 0.480267 | 0.006664 | -0.099217 | 0.143017 | 0.241337 | 0.006187 | 0.244784 | -0.207948 | 0.052418 | -0.150183 | 0.389279 | 0.066220 | -0.059230 | -0.001704 |
| c106_B | -0.281742 | 0.080878 | -0.016646 | -0.009276 | -0.062560 | 0.081292 | 0.067272 | 0.251374 | -0.134594 | 0.012161 | 0.041679 | -0.041731 | -0.029070 | -0.487577 | 1.000000 | -0.822498 | -0.105650 | 0.116382 | -0.014874 | -0.061564 | -0.028735 | -0.001838 | -0.064863 | -0.054688 | -0.003025 | -0.228309 | 0.049620 | 0.244715 | -0.070294 | -0.114377 | 0.002071 | -0.117069 | -0.102453 | 0.053830 | 0.316850 | -0.189514 | -0.376594 | 0.140085 | 0.000192 |
| c106_C | 0.357267 | -0.028357 | 0.015371 | -0.010125 | 0.055573 | 0.134315 | -0.069709 | -0.248788 | 0.225436 | -0.032795 | -0.037333 | 0.076719 | 0.008981 | -0.095548 | -0.822498 | 1.000000 | 0.075048 | -0.062980 | -0.012393 | 0.032899 | -0.023512 | 0.015573 | -0.011827 | -0.008838 | -0.005709 | -0.052567 | -0.060915 | -0.214373 | -0.013024 | -0.026814 | -0.006392 | -0.025991 | 0.252278 | -0.095521 | -0.263414 | -0.037525 | 0.386228 | -0.121130 | 0.000891 |
| c96_AIRBNE | -0.018543 | 0.012648 | 0.005388 | -0.016079 | -0.003578 | -0.007594 | -0.113638 | 0.074236 | 0.057711 | -0.012661 | -0.021927 | -0.062828 | 0.024362 | 0.069700 | -0.105650 | 0.075048 | 1.000000 | -0.428913 | -0.267035 | -0.051962 | -0.042717 | -0.189130 | 0.004113 | 0.005730 | 0.000008 | 0.025666 | 0.030051 | 0.047693 | 0.009714 | 0.021620 | 0.011818 | 0.008282 | -0.072917 | 0.039147 | -0.110736 | 0.010454 | 0.069367 | 0.029532 | -0.003529 |
| c96_GND | 0.067325 | -0.016672 | -0.032649 | 0.006911 | -0.010877 | 0.089698 | 0.257706 | -0.189161 | -0.138870 | 0.015781 | 0.007990 | 0.146347 | 0.015360 | -0.107007 | 0.116382 | -0.062980 | -0.428913 | 1.000000 | -0.471346 | -0.091718 | -0.075401 | -0.333836 | -0.002060 | -0.014458 | 0.001981 | -0.071497 | -0.054832 | -0.137154 | -0.016892 | -0.029458 | -0.010600 | -0.019429 | 0.184231 | -0.081543 | 0.230935 | -0.041244 | -0.124836 | -0.087517 | 0.001656 |
| c96_LDG | 0.022960 | -0.012364 | 0.044935 | -0.023249 | 0.031276 | -0.067026 | -0.107214 | 0.083391 | 0.066288 | 0.015585 | 0.017169 | -0.041491 | -0.046559 | 0.045055 | -0.014874 | -0.012393 | -0.267035 | -0.471346 | 1.000000 | -0.057102 | -0.046943 | -0.207841 | -0.002697 | 0.011467 | 0.001143 | 0.044706 | 0.028428 | 0.066981 | 0.006696 | 0.010044 | 0.000649 | 0.001353 | -0.090829 | 0.037366 | -0.102481 | 0.025131 | 0.049153 | 0.045638 | 0.003891 |
| c96_OTH | -0.026256 | -0.000420 | 0.009513 | -0.007169 | -0.011944 | -0.046189 | -0.005139 | -0.007418 | 0.004949 | 0.001066 | -0.028661 | -0.048335 | -0.021966 | 0.057245 | -0.061564 | 0.032899 | -0.051962 | -0.091718 | -0.057102 | 1.000000 | -0.009135 | -0.040443 | 0.000207 | 0.001438 | 0.004294 | 0.030509 | -0.003231 | -0.006906 | 0.004508 | 0.020951 | 0.001135 | 0.008546 | -0.010043 | 0.008027 | -0.030993 | 0.022405 | 0.027397 | -0.010430 | -0.000755 |
| c96_SPL | -0.052605 | -0.008695 | 0.003028 | -0.003216 | -0.011877 | -0.041541 | -0.024267 | 0.010416 | -0.001330 | -0.005602 | 0.031786 | -0.025451 | -0.001772 | 0.086382 | -0.028735 | -0.023512 | -0.042717 | -0.075401 | -0.046943 | -0.009135 | 1.000000 | -0.033248 | 0.003309 | 0.009707 | -0.001794 | 0.054117 | 0.006677 | -0.003728 | 0.023132 | 0.029885 | 0.002372 | 0.028410 | -0.032584 | 0.016674 | -0.033152 | 0.043738 | 0.014171 | -0.000088 | -0.000620 |
| c96_T/O | -0.084736 | 0.027539 | -0.018062 | 0.040666 | -0.013084 | -0.014793 | -0.104452 | 0.087629 | 0.041810 | -0.023538 | -0.006571 | -0.068620 | 0.013976 | -0.020689 | -0.001838 | 0.015573 | -0.189130 | -0.333836 | -0.207841 | -0.040443 | -0.033248 | 1.000000 | 0.000635 | -0.002691 | -0.005253 | -0.003982 | 0.009244 | 0.065760 | -0.002246 | -0.008814 | -0.000097 | 0.007284 | -0.061068 | 0.020736 | -0.064107 | -0.001032 | 0.028850 | 0.040689 | -0.002747 |
| c115_BLS | -0.000654 | -0.041961 | -0.001542 | 0.038172 | -0.012031 | -0.049874 | -0.003157 | -0.001319 | -0.007822 | 0.067990 | 0.007529 | 0.013023 | 0.016238 | 0.131674 | -0.064863 | -0.011827 | 0.004113 | -0.002060 | -0.002697 | 0.000207 | 0.003309 | 0.000635 | 1.000000 | -0.001990 | -0.000934 | -0.006935 | -0.008278 | -0.026571 | -0.001836 | -0.005858 | -0.001093 | -0.003907 | -0.062314 | 0.000600 | -0.032230 | 0.212999 | -0.011593 | -0.010701 | -0.000323 |
| c115_DRZ | -0.002262 | -0.021342 | -0.001143 | 0.007435 | 0.004599 | -0.060599 | -0.002293 | -0.015277 | -0.030765 | 0.000547 | 0.004928 | 0.009641 | 0.007250 | 0.109278 | -0.054688 | -0.008838 | 0.005730 | -0.014458 | 0.011467 | 0.001438 | 0.009707 | -0.002691 | -0.001990 | 1.000000 | -0.000914 | -0.006782 | -0.008095 | -0.025985 | -0.001796 | -0.005729 | -0.001069 | -0.003821 | -0.060938 | 0.007987 | -0.034399 | 0.017923 | 0.006553 | 0.032351 | -0.000316 |
| c115_DST | 0.000979 | -0.000885 | 0.004531 | -0.004762 | -0.007653 | -0.034301 | -0.003911 | 0.020077 | -0.012576 | 0.070985 | -0.003213 | 0.003917 | 0.001905 | 0.014058 | -0.003025 | -0.005709 | 0.000008 | 0.001981 | 0.001143 | 0.004294 | -0.001794 | -0.005253 | -0.000934 | -0.000914 | 1.000000 | -0.003183 | -0.003799 | -0.012196 | -0.000843 | -0.002689 | -0.000502 | -0.001793 | -0.028601 | 0.011800 | -0.005822 | 0.036826 | -0.013800 | 0.000223 | 0.345717 |
| c115_FOG | -0.036486 | -0.057095 | -0.028252 | 0.013598 | 0.020004 | -0.270817 | -0.006372 | -0.028721 | -0.090605 | -0.026327 | -0.008738 | 0.029196 | 0.019944 | 0.480267 | -0.228309 | -0.052567 | 0.025666 | -0.071497 | 0.044706 | 0.030509 | 0.054117 | -0.003982 | -0.006935 | -0.006782 | -0.003183 | 1.000000 | -0.028208 | -0.090546 | -0.006257 | -0.019963 | -0.003726 | -0.013313 | -0.212342 | 0.047049 | -0.111308 | 0.363867 | 0.015938 | -0.024179 | -0.001100 |
| c115_HAZ | 0.003433 | 0.040071 | 0.005945 | -0.038521 | 0.047597 | 0.009111 | -0.045123 | 0.138377 | -0.101654 | -0.024646 | 0.013974 | 0.042994 | -0.004340 | 0.006664 | 0.049620 | -0.060915 | 0.030051 | -0.054832 | 0.028428 | -0.003231 | 0.006677 | 0.009244 | -0.008278 | -0.008095 | -0.003799 | -0.028208 | 1.000000 | -0.108074 | -0.007468 | -0.023828 | -0.004447 | -0.015891 | -0.253448 | 0.087383 | -0.023308 | 0.056659 | -0.093838 | 0.101833 | -0.001314 |
| c115_NONE | -0.110984 | 0.021340 | 0.073226 | 0.023635 | -0.085339 | -0.008747 | -0.137889 | 0.559807 | 0.335947 | 0.028133 | 0.010925 | 0.074984 | -0.022469 | -0.099217 | 0.244715 | -0.214373 | 0.047693 | -0.137154 | 0.066981 | -0.006906 | -0.003728 | 0.065760 | -0.026571 | -0.025985 | -0.012196 | -0.090546 | -0.108074 | 1.000000 | -0.023972 | -0.076487 | -0.014274 | -0.051008 | -0.813555 | 0.207461 | 0.252291 | -0.065427 | -0.521517 | 0.314811 | -0.004216 |
| c115_OTH | -0.008437 | -0.015872 | 0.001557 | 0.005006 | 0.004190 | -0.037571 | -0.004065 | -0.015254 | -0.039154 | 0.009259 | 0.001741 | 0.010348 | 0.007402 | 0.143017 | -0.070294 | -0.013024 | 0.009714 | -0.016892 | 0.006696 | 0.004508 | 0.023132 | -0.002246 | -0.001836 | -0.001796 | -0.000843 | -0.006257 | -0.007468 | -0.023972 | 1.000000 | -0.005285 | -0.000986 | -0.003525 | -0.056218 | 0.006830 | -0.029275 | 0.062217 | 0.014781 | -0.006504 | -0.000291 |
| c115_RAI | -0.001396 | -0.005112 | 0.019598 | 0.013410 | 0.005934 | -0.160739 | -0.015675 | 0.079452 | -0.086953 | 0.061445 | -0.003569 | 0.027983 | 0.025854 | 0.241337 | -0.114377 | -0.026814 | 0.021620 | -0.029458 | 0.010044 | 0.020951 | 0.029885 | -0.008814 | -0.005858 | -0.005729 | -0.002689 | -0.019963 | -0.023828 | -0.076487 | -0.005285 | 1.000000 | -0.003147 | -0.011246 | -0.179372 | 0.093136 | -0.101524 | 0.034815 | 0.056262 | -0.018819 | -0.000930 |
| c115_SMK | -0.006058 | 0.001704 | 0.003055 | 0.001143 | -0.012917 | -0.000406 | -0.005604 | 0.009378 | -0.012102 | -0.004302 | 0.004325 | 0.004771 | 0.003630 | 0.006187 | 0.002071 | -0.006392 | 0.011818 | -0.010600 | 0.000649 | 0.001135 | 0.002372 | -0.000097 | -0.001093 | -0.001069 | -0.000502 | -0.003726 | -0.004447 | -0.014274 | -0.000986 | -0.003147 | 1.000000 | -0.002099 | -0.033474 | 0.004466 | -0.003458 | 0.022328 | -0.016808 | 0.021727 | -0.000173 |
| c115_SNO | 0.006183 | -0.077582 | 0.004627 | 0.056722 | -0.005393 | -0.089721 | -0.004625 | 0.010756 | -0.056555 | 0.046560 | 0.003540 | 0.022136 | 0.026383 | 0.244784 | -0.117069 | -0.025991 | 0.008282 | -0.019429 | 0.001353 | 0.008546 | 0.028410 | 0.007284 | -0.003907 | -0.003821 | -0.001793 | -0.013313 | -0.015891 | -0.051008 | -0.003525 | -0.011246 | -0.002099 | 1.000000 | -0.119621 | 0.019560 | -0.067135 | 0.219964 | 0.017660 | -0.020971 | -0.000620 |
| c115_nan | 0.112941 | 0.006110 | -0.067042 | -0.030380 | 0.055084 | 0.164796 | 0.152213 | -0.578710 | -0.201752 | -0.044754 | -0.013598 | -0.109284 | 0.001163 | -0.207948 | -0.102453 | 0.252278 | -0.072917 | 0.184231 | -0.090829 | -0.010043 | -0.032584 | -0.061068 | -0.062314 | -0.060938 | -0.028601 | -0.212342 | -0.253448 | -0.813555 | -0.056218 | -0.179372 | -0.033474 | -0.119621 | 1.000000 | -0.269223 | -0.139736 | -0.155065 | 0.492502 | -0.313918 | -0.009888 |
| c112_BRKN | -0.048806 | 0.001275 | 0.018653 | 0.010352 | 0.011393 | -0.054934 | -0.070819 | 0.382594 | -0.038739 | 0.052019 | 0.005435 | 0.056541 | 0.007857 | 0.052418 | 0.053830 | -0.095521 | 0.039147 | -0.081543 | 0.037366 | 0.008027 | 0.016674 | 0.020736 | 0.000600 | 0.007987 | 0.011800 | 0.047049 | 0.087383 | 0.207461 | 0.006830 | 0.093136 | 0.004466 | 0.019560 | -0.269223 | 1.000000 | -0.210364 | -0.030790 | -0.234449 | -0.085625 | -0.001895 |
| c112_CLER | -0.142594 | 0.022845 | 0.049656 | 0.000075 | -0.068721 | 0.103996 | 0.356284 | -0.013487 | -0.357044 | -0.046498 | -0.014371 | 0.147097 | -0.015960 | -0.150183 | 0.316850 | -0.263414 | -0.110736 | 0.230935 | -0.102481 | -0.030993 | -0.033152 | -0.064107 | -0.032230 | -0.034399 | -0.005822 | -0.111308 | -0.023308 | 0.252291 | -0.029275 | -0.101524 | -0.003458 | -0.067135 | -0.139736 | -0.210364 | 1.000000 | -0.092386 | -0.703465 | -0.256918 | -0.005687 |
| c112_OBSC | -0.024711 | -0.051857 | -0.010968 | 0.027711 | -0.004440 | -0.166864 | -0.002699 | -0.036314 | -0.005656 | 0.003817 | 0.000568 | 0.024677 | 0.020120 | 0.389279 | -0.189514 | -0.037525 | 0.010454 | -0.041244 | 0.025131 | 0.022405 | 0.043738 | -0.001032 | 0.212999 | 0.017923 | 0.036826 | 0.363867 | 0.056659 | -0.065427 | 0.062217 | 0.034815 | 0.022328 | 0.219964 | -0.155065 | -0.030790 | -0.092386 | 1.000000 | -0.102963 | -0.037604 | -0.000832 |
| c112_OCST | 0.201998 | -0.036189 | -0.070744 | 0.006814 | 0.052207 | -0.032076 | -0.257334 | -0.398548 | 0.355387 | -0.014332 | -0.000137 | -0.221646 | 0.012111 | 0.066220 | -0.376594 | 0.386228 | 0.069367 | -0.124836 | 0.049153 | 0.027397 | 0.014171 | 0.028850 | -0.011593 | 0.006553 | -0.013800 | 0.015938 | -0.093838 | -0.521517 | 0.014781 | 0.056262 | -0.016808 | 0.017660 | 0.492502 | -0.234449 | -0.703465 | -0.102963 | 1.000000 | -0.286333 | -0.006339 |
| c112_SCAT | -0.054336 | 0.042584 | 0.025965 | -0.031226 | 0.017602 | -0.006624 | -0.095082 | 0.389328 | -0.000071 | 0.042487 | 0.019359 | 0.073517 | -0.008547 | -0.059230 | 0.140085 | -0.121130 | 0.029532 | -0.087517 | 0.045638 | -0.010430 | -0.000088 | 0.040689 | -0.010701 | 0.032351 | 0.000223 | -0.024179 | 0.101833 | 0.314811 | -0.006504 | -0.018819 | 0.021727 | -0.020971 | -0.313918 | -0.085625 | -0.256918 | -0.037604 | -0.286333 | 1.000000 | -0.002315 |
| c112_nan | 0.001979 | 0.000302 | -0.004468 | -0.001907 | -0.002234 | -0.007683 | -0.000773 | nan | nan | 0.010079 | -0.001450 | 0.002677 | 0.001720 | -0.001704 | 0.000192 | 0.000891 | -0.003529 | 0.001656 | 0.003891 | -0.000755 | -0.000620 | -0.002747 | -0.000323 | -0.000316 | 0.345717 | -0.001100 | -0.001314 | -0.004216 | -0.000291 | -0.000930 | -0.000173 | -0.000620 | -0.009888 | -0.001895 | -0.005687 | -0.000832 | -0.006339 | -0.002315 | 1.000000 |
# Keep only the strict upper triangle of the correlation matrix: this drops
# the main diagonal (all 1.0) and the mirrored lower half, so every pair of
# distinct attributes appears exactly once.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_filter = corr.where(upper_mask)
# Stack into (attribute, attribute) -> correlation pairs, sort strongest
# first, and map the cryptic column codes to their human-readable legend.
corr_sort = (
    corr_filter.stack()
    .sort_values(ascending=False)
    .to_frame()
    .rename(columns=leyend_dict, index=leyend_dict)
)
corr_sort.head(10)
| 0 | ||
|---|---|---|
| Cloud ceiling | c115_NONE | 0.559807 |
| c115_nan | c112_OCST | 0.492502 |
| c106_A | c115_FOG | 0.480267 |
| Cloud ceiling | c112_SCAT | 0.389328 |
| c106_A | c112_OBSC | 0.389279 |
| c106_C | c112_OCST | 0.386228 |
| Cloud ceiling | c112_BRKN | 0.382594 |
| c115_FOG | c112_OBSC | 0.363867 |
| Type of Event | c106_C | 0.357267 |
| Light condition code | c112_CLER | 0.356284 |
Crear el modelo de predicción con el conjunto de datos obtenido.
Para el modelado, se emplean unos datos de entrenamiento (train) y unos de prueba (test). Una vez realizado el modelo, existen además los datos de validación, que no son vistos por el modelo durante el train ni el test.
A continuación, se parte los datos en:
# sample returns a random sample of items from an axis of the object;
# here it draws 95% of the rows (seeded) as the modelling set.
data = df.sample(frac=0.95, random_state=786)
data
| c1 | c7 | c10 | c20 | c21 | c108 | c110 | c113 | c114 | c241 | ... | c115_RAI | c115_SMK | c115_SNO | c115_nan | c112_BRKN | c112_CLER | c112_OBSC | c112_OCST | c112_SCAT | c112_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 39412 | 0 | 11.0 | 14.0 | 0.826716 | -5.557165 | 18.0 | 0.0 | 1.431143 | 10.0 | 0.080808 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 137800 | 0 | 6.0 | 19.0 | 0.762452 | -3.901295 | 6.0 | 0.0 | 0.021753 | 10.0 | 0.101010 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 113829 | 1 | 4.0 | 16.0 | 0.728914 | -7.125052 | 22.0 | 0.0 | 0.021753 | 10.0 | 0.080808 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 86422 | 1 | 5.0 | 20.0 | 0.767973 | -4.799165 | 18.0 | 1.0 | 5.152115 | 10.0 | 0.050505 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 5646 | 0 | 11.0 | 13.0 | 0.809708 | -3.792257 | 18.0 | 0.0 | 0.021753 | 7.0 | 0.080808 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24016 | 0 | 8.0 | 11.0 | 0.754364 | -3.640357 | 18.0 | 0.0 | 1.431143 | 10.0 | 0.080808 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 43704 | 0 | 11.0 | 19.0 | 0.798086 | -3.364378 | 6.0 | 1.0 | 2.003600 | 1.0 | 0.151515 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 20908 | 1 | 2.0 | 14.0 | 0.790607 | -5.521070 | 18.0 | 0.0 | 0.021753 | 10.0 | 0.080808 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 82115 | 0 | 10.0 | 13.0 | 0.827903 | -5.438351 | 18.0 | 0.0 | 1.431143 | 10.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 54017 | 1 | 10.0 | 10.0 | 0.777458 | -5.373681 | 18.0 | 0.0 | 0.005725 | 1.0 | 0.080808 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
131123 rows × 39 columns
# Bar chart of the target-class frequencies in the modelling split,
# to inspect the class balance before training.
ax = data.c1.value_counts().plot(kind='bar', title='Tipo de evento - DATOS ENTRENAMIENTO')
ax.set_ylabel('Frecuencia')
# Tilt the x tick labels slightly for readability
plt.setp(ax.xaxis.get_ticklabels(), rotation=15)
plt.show()
Para el entrenamiento, existe un desbalanceo hacia la clase 0 (Accidente), por lo que se deberá tener esto en cuenta a la hora del entrenamiento.
# Remove the sampled modelling rows from the original dataframe; the
# remaining rows are held out as unseen data for final validation.
data_unseen = df.drop(data.index)
data_unseen
| c1 | c7 | c10 | c20 | c21 | c108 | c110 | c113 | c114 | c241 | ... | c115_RAI | c115_SMK | c115_SNO | c115_nan | c112_BRKN | c112_CLER | c112_OBSC | c112_OCST | c112_SCAT | c112_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 57 | 0 | 0.0 | 13.0 | 0.745894 | -3.617045 | 18.0 | 0.0 | 0.021753 | 7.0 | 0.080808 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 72 | 0 | 0.0 | 13.0 | 0.786976 | -3.841888 | 18.0 | 0.0 | 0.021753 | 7.0 | 0.080808 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 97 | 0 | 0.0 | 19.0 | 0.774605 | -4.586353 | 3.0 | 1.0 | 0.021753 | 5.0 | 0.080808 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 150 | 0 | 4.0 | 13.0 | 0.895473 | -6.670854 | 18.0 | 0.0 | 0.021753 | 7.0 | 0.080808 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 158 | 0 | 4.0 | 13.0 | 0.815791 | -3.429049 | 18.0 | 0.0 | 0.021753 | 7.0 | 0.080808 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 137950 | 0 | 6.0 | 17.0 | 0.823314 | -5.031528 | 6.0 | 0.0 | 0.021753 | 10.0 | 0.111111 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 137959 | 1 | 5.0 | 11.0 | 0.800877 | -3.344075 | 6.0 | 0.0 | 0.021753 | 10.0 | 0.262626 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 137975 | 0 | 2.0 | 15.0 | 0.817107 | -4.209609 | 6.0 | 0.0 | 0.021753 | 10.0 | 0.151515 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 138012 | 0 | 6.0 | 14.0 | 0.757800 | -3.656900 | 18.0 | 0.0 | 0.021753 | 10.0 | 0.070707 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 138022 | 1 | 8.0 | 17.0 | 0.788251 | -3.899039 | 6.0 | 0.0 | 0.021753 | 10.0 | 0.131313 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
6901 rows × 39 columns
# Class balance of the held-out validation split
ax = data_unseen.c1.value_counts().plot(kind='bar', title='Tipo de evento - DATOS VALIDACIÓN')
ax.set_ylabel('Frecuencia')
# Tilt the x tick labels slightly for readability
plt.setp(ax.xaxis.get_ticklabels(), rotation=15)
plt.show()
# Renumber the index of both splits from 0 so row positions no longer
# reference the original dataframe.
data = data.reset_index(drop=True)
data_unseen = data_unseen.reset_index(drop=True)
# Report the size of each split
print('[+] MODELING_DATA\t', len(data))
print('[+] VALIDATION_DATA\t', len(data_unseen))
[+] MODELING_DATA 131123 [+] VALIDATION_DATA 6901
Para la configuración de nuestro clasificador, se emplea el método setup() de PyCaret, en el cual se indican los parámetros necesarios para realizar los modelos:
# PyCaret classification experiment configuration: defines the target,
# the cross-validation scheme and the preprocessing behaviour used by all
# subsequent model comparisons.
model_setup = setup(
data=data, target='c1',
verbose=False, n_jobs=-1, session_id=2023051,
# Cross-validation: shuffled 10-fold k-fold
fold_strategy = 'kfold', fold = 10, fold_shuffle=True,
# Declare which features are categorical and which are numeric
categorical_features=cat_cols_mapped, numeric_features=num_cols,
# Transformation pipeline — PyCaret's own preprocessing is disabled because
# the data was already encoded and scaled earlier in the notebook.
# NOTE(review): with preprocess=False, pca/fix_imbalance below may be
# ignored by PyCaret — confirm against the PyCaret version in use.
preprocess=False, # custom_pipeline=transformer_pipeline,
# Principal component analysis
pca = True, pca_method='incremental', pca_components=20,
# Do not drop low-variance features
low_variance_threshold=None,
# Multicollinearity removal disabled
remove_multicollinearity=False, # multicollinearity_threshold=0.95,
# Compensate the class imbalance observed in the target
fix_imbalance=True,
# Outlier removal disabled
remove_outliers=False
)
# Inspect the training feature matrix produced by setup()
get_config('X_train')
| c7 | c10 | c20 | c21 | c108 | c110 | c113 | c114 | c241 | c41 | ... | c115_RAI | c115_SMK | c115_SNO | c115_nan | c112_BRKN | c112_CLER | c112_OBSC | c112_OCST | c112_SCAT | c112_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 70050 | 2.0 | 16.0 | 0.772454 | -4.665687 | 18.0 | 0.0 | 0.021753 | 10.0 | 0.080808 | 4.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 99633 | 7.0 | 16.0 | 0.787088 | -3.456120 | 18.0 | 0.0 | 0.021753 | 10.0 | 0.080808 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 53298 | 5.0 | 19.0 | 0.755609 | -3.661412 | 18.0 | 3.0 | 0.021753 | 10.0 | 0.080808 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 75469 | 0.0 | 15.0 | 0.891486 | -6.722741 | 18.0 | 0.0 | 2.289829 | 10.0 | 0.000000 | 9.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 81947 | 0.0 | 17.0 | 0.820501 | -5.597020 | 18.0 | 3.0 | 1.431143 | 10.0 | 0.151515 | 3.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41780 | 5.0 | 9.0 | 0.807669 | -3.965966 | 6.0 | 0.0 | 2.289829 | 10.0 | 0.525253 | 3.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 48709 | 3.0 | 9.0 | 0.795584 | -4.274280 | 18.0 | 0.0 | 0.021753 | 10.0 | 0.080808 | 4.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 109604 | 6.0 | 17.0 | 0.809577 | -3.744882 | 18.0 | 0.0 | 2.289829 | 10.0 | 0.050505 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1551 | 5.0 | 13.0 | 0.774466 | -4.240440 | 18.0 | 0.0 | 0.021753 | 7.0 | 0.080808 | 3.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 57406 | 6.0 | 13.0 | 0.808625 | -3.280156 | 18.0 | 0.0 | 1.144914 | 10.0 | 0.000000 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
91786 rows × 38 columns
Conjunto de entrenamiento creado por el setup de pycaret.
# Catalogue of all classification estimators available in PyCaret.
test_models = models()
# Keep only the IDs of the "Turbo" models (fast-training estimators).
# Boolean-mask indexing replaces the original iterrows() loop with
# `== True`, which is both slower and non-idiomatic pandas.
test_models_id = test_models.index[test_models["Turbo"]].tolist()
print(test_models_id)
test_models
['lr', 'knn', 'nb', 'dt', 'svm', 'ridge', 'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost', 'dummy']
| Name | Reference | Turbo | |
|---|---|---|---|
| ID | |||
| lr | Logistic Regression | sklearn.linear_model._logistic.LogisticRegression | True |
| knn | K Neighbors Classifier | sklearn.neighbors._classification.KNeighborsCl... | True |
| nb | Naive Bayes | sklearn.naive_bayes.GaussianNB | True |
| dt | Decision Tree Classifier | sklearn.tree._classes.DecisionTreeClassifier | True |
| svm | SVM - Linear Kernel | sklearn.linear_model._stochastic_gradient.SGDC... | True |
| rbfsvm | SVM - Radial Kernel | sklearn.svm._classes.SVC | False |
| gpc | Gaussian Process Classifier | sklearn.gaussian_process._gpc.GaussianProcessC... | False |
| mlp | MLP Classifier | sklearn.neural_network._multilayer_perceptron.... | False |
| ridge | Ridge Classifier | sklearn.linear_model._ridge.RidgeClassifier | True |
| rf | Random Forest Classifier | sklearn.ensemble._forest.RandomForestClassifier | True |
| qda | Quadratic Discriminant Analysis | sklearn.discriminant_analysis.QuadraticDiscrim... | True |
| ada | Ada Boost Classifier | sklearn.ensemble._weight_boosting.AdaBoostClas... | True |
| gbc | Gradient Boosting Classifier | sklearn.ensemble._gb.GradientBoostingClassifier | True |
| lda | Linear Discriminant Analysis | sklearn.discriminant_analysis.LinearDiscrimina... | True |
| et | Extra Trees Classifier | sklearn.ensemble._forest.ExtraTreesClassifier | True |
| xgboost | Extreme Gradient Boosting | xgboost.sklearn.XGBClassifier | True |
| lightgbm | Light Gradient Boosting Machine | lightgbm.sklearn.LGBMClassifier | True |
| catboost | CatBoost Classifier | catboost.core.CatBoostClassifier | True |
| dummy | Dummy Classifier | sklearn.dummy.DummyClassifier | True |
De los modelos disponibles, se realiza una comprobación de sus predicciones ordenadas por la Sensibilidad (Recall) para poder compararlos y escoger el mejor de ellos.
Dada la importancia de la predicción, es importante escoger un modelo cuya proporción de verdaderos positivos en relación con el total de muestras positivas en el conjunto de datos sea lo más elevada posible.
# Train and compare every "Turbo" model on a single hold-out split (no
# cross-validation, for speed), ranked by Recall at a 0.5 probability
# threshold — Recall is the priority metric for this study.
compare_models(cross_validation=False, include=test_models_id, probability_threshold=0.5, sort='Recall')
| Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
|---|---|---|---|---|---|---|---|---|---|
| dummy | Dummy Classifier | 0.5662 | 0.5000 | 1.0000 | 0.5662 | 0.7230 | 0.0000 | 0.0000 | 0.0600 |
| catboost | CatBoost Classifier | 0.8071 | 0.8932 | 0.8272 | 0.8313 | 0.8292 | 0.6077 | 0.6077 | 39.6000 |
| xgboost | Extreme Gradient Boosting | 0.8057 | 0.8938 | 0.8246 | 0.8310 | 0.8278 | 0.6050 | 0.6050 | 12.8800 |
| rf | Random Forest Classifier | 0.7867 | 0.8758 | 0.8189 | 0.8071 | 0.8130 | 0.5648 | 0.5648 | 15.5000 |
| lightgbm | Light Gradient Boosting Machine | 0.7984 | 0.8868 | 0.8153 | 0.8262 | 0.8207 | 0.5904 | 0.5904 | 1.9000 |
| et | Extra Trees Classifier | 0.7763 | 0.8666 | 0.8134 | 0.7959 | 0.8046 | 0.5430 | 0.5432 | 13.6400 |
| ada | Ada Boost Classifier | 0.7664 | 0.8560 | 0.8000 | 0.7900 | 0.7950 | 0.5235 | 0.5235 | 6.3600 |
| gbc | Gradient Boosting Classifier | 0.7799 | 0.8693 | 0.7965 | 0.8113 | 0.8038 | 0.5532 | 0.5533 | 24.8500 |
| nb | Naive Bayes | 0.7020 | 0.7903 | 0.7721 | 0.7212 | 0.7458 | 0.3867 | 0.3881 | 0.1500 |
| knn | K Neighbors Classifier | 0.7345 | 0.8100 | 0.7679 | 0.7643 | 0.7661 | 0.4592 | 0.4592 | 0.0800 |
| lr | Logistic Regression | 0.7307 | 0.8180 | 0.7535 | 0.7667 | 0.7601 | 0.4532 | 0.4533 | 20.6000 |
| dt | Decision Tree Classifier | 0.7200 | 0.7153 | 0.7506 | 0.7538 | 0.7522 | 0.4304 | 0.4304 | 2.1500 |
| ridge | Ridge Classifier | 0.7300 | 0.7274 | 0.7469 | 0.7694 | 0.7580 | 0.4528 | 0.4530 | 0.1300 |
| lda | Linear Discriminant Analysis | 0.7297 | 0.8171 | 0.7453 | 0.7700 | 0.7574 | 0.4525 | 0.4528 | 0.9800 |
| svm | SVM - Linear Kernel | 0.7092 | 0.7247 | 0.6072 | 0.8340 | 0.7027 | 0.4316 | 0.4525 | 3.3900 |
| qda | Quadratic Discriminant Analysis | 0.5262 | 0.5722 | 0.2250 | 0.7846 | 0.3497 | 0.1302 | 0.1940 | 0.4500 |
CustomProbabilityThresholdClassifier(classifier=DummyClassifier(constant=None,
random_state=2023051,
strategy='prior'),
constant=None, probability_threshold=0.5,
random_state=2023051, strategy='prior')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. CustomProbabilityThresholdClassifier(classifier=DummyClassifier(constant=None,
random_state=2023051,
strategy='prior'),
constant=None, probability_threshold=0.5,
random_state=2023051, strategy='prior')DummyClassifier(constant=None, random_state=2023051, strategy='prior')
DummyClassifier(random_state=2023051)
Teniendo en cuenta los resultados obtenidos, en cuanto a métricas y tiempos de ejecución, se decide escoger un top 3 para crear los modelos, poder ajustarlos y finalmente realizar el modelo con el mejor de ellos. En base a ello, se escoge:
A continuación, se muestra la comparación de los modelos, esta vez, empleando la validación cruzada con 10 folds como se ha indicado en los ajustes.
# Re-compare the three short-listed candidates, this time with the
# 10-fold cross-validation configured in setup(), still sorted by Recall.
compare_models(cross_validation=True, include=['xgboost', 'catboost', 'rf'], probability_threshold=0.5, sort='Recall')
| Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
|---|---|---|---|---|---|---|---|---|---|
| xgboost | Extreme Gradient Boosting | 0.8069 | 0.8948 | 0.8291 | 0.8297 | 0.8294 | 0.6069 | 0.6069 | 14.6170 |
| catboost | CatBoost Classifier | 0.8056 | 0.8944 | 0.8288 | 0.8281 | 0.8284 | 0.6043 | 0.6043 | 32.9740 |
| rf | Random Forest Classifier | 0.7895 | 0.8792 | 0.8191 | 0.8109 | 0.8150 | 0.5708 | 0.5708 | 15.1300 |
CustomProbabilityThresholdClassifier(base_score=None, booster='gbtree',
callbacks=None,
classifier=XGBClassifier(base_score=None,
booster='gbtree',
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None,
gamma=None,
gpu_id=None,
grow_policy=No...
gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1,
num_parallel_tree=None,
objective='binary:logistic', ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. CustomProbabilityThresholdClassifier(base_score=None, booster='gbtree',
callbacks=None,
classifier=XGBClassifier(base_score=None,
booster='gbtree',
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None,
gamma=None,
gpu_id=None,
grow_policy=No...
gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1,
num_parallel_tree=None,
objective='binary:logistic', ...)XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1, num_parallel_tree=None,
objective='binary:logistic', predictor=None, ...)XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1, num_parallel_tree=None,
predictor=None, random_state=2023051, ...)En base a los resultados anteriores, se escoge el modelo Extreme Gradient Boosting para realizar las predicciones.
# Grid of learning rates to evaluate: 0.1 .. 0.9 in steps of 0.1.
# Computed once instead of repeating np.arange(0.1, 1, 0.1) twice.
learning_rates = np.arange(0.1, 1, 0.1)
# Named `trained_models` so it no longer shadows pycaret's `models()`
# helper, which this notebook calls earlier.
trained_models = []
results = []
for lr in learning_rates:
    # One cross-validated XGBoost model per learning rate.
    model = create_model('xgboost', cross_validation=True, learning_rate=lr, verbose=False)
    # pull() returns the scoring grid of the last command; keep the CV mean row.
    model_results = pull().loc[['Mean']]
    trained_models.append(model)
    results.append(model_results)
# One row per learning rate, indexed by the rate itself, then plotted
# to visualise how each metric responds to the learning rate.
results = pd.concat(results, axis=0)
results.index = learning_rates
results.plot()
<AxesSubplot: >
# Highlight the best (maximum) value of every metric column in green.
results.style.highlight_max(color = 'lightgreen', axis = 0)
| Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|
| 0.100000 | 0.795600 | 0.885200 | 0.815500 | 0.822100 | 0.818800 | 0.584500 | 0.584500 |
| 0.200000 | 0.803200 | 0.891700 | 0.824000 | 0.827700 | 0.825800 | 0.599800 | 0.599800 |
| 0.300000 | 0.806900 | 0.894800 | 0.829100 | 0.829700 | 0.829400 | 0.606900 | 0.606900 |
| 0.400000 | 0.809900 | 0.896500 | 0.833000 | 0.831500 | 0.832200 | 0.612900 | 0.612900 |
| 0.500000 | 0.810200 | 0.897000 | 0.834000 | 0.831300 | 0.832600 | 0.613400 | 0.613400 |
| 0.600000 | 0.808900 | 0.895500 | 0.832900 | 0.830100 | 0.831500 | 0.610800 | 0.610800 |
| 0.700000 | 0.807100 | 0.894000 | 0.832500 | 0.827700 | 0.830100 | 0.606900 | 0.606900 |
| 0.800000 | 0.805600 | 0.891100 | 0.830900 | 0.826600 | 0.828700 | 0.603900 | 0.604000 |
| 0.900000 | 0.803300 | 0.889400 | 0.828900 | 0.824600 | 0.826700 | 0.599200 | 0.599200 |
Tras la creación del modelo teniendo en cuenta su tasa de aprendizaje, se establece esta en 0.5.
# Cross-validated XGBoost model with the learning rate fixed at 0.5,
# the value that maximised Recall in the sweep above.
model = create_model('xgboost', cross_validation=True, learning_rate = 0.5)
| Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|
| Fold | |||||||
| 0 | 0.8133 | 0.8967 | 0.8347 | 0.8341 | 0.8344 | 0.6204 | 0.6204 |
| 1 | 0.8096 | 0.8969 | 0.8351 | 0.8303 | 0.8327 | 0.6117 | 0.6117 |
| 2 | 0.8059 | 0.8913 | 0.8289 | 0.8270 | 0.8280 | 0.6052 | 0.6052 |
| 3 | 0.8093 | 0.8975 | 0.8339 | 0.8291 | 0.8315 | 0.6120 | 0.6120 |
| 4 | 0.8138 | 0.9006 | 0.8361 | 0.8340 | 0.8351 | 0.6213 | 0.6213 |
| 5 | 0.8133 | 0.9004 | 0.8331 | 0.8386 | 0.8358 | 0.6194 | 0.6194 |
| 6 | 0.8089 | 0.8975 | 0.8293 | 0.8334 | 0.8313 | 0.6109 | 0.6109 |
| 7 | 0.8109 | 0.8977 | 0.8345 | 0.8348 | 0.8346 | 0.6137 | 0.6137 |
| 8 | 0.8056 | 0.8942 | 0.8346 | 0.8225 | 0.8285 | 0.6042 | 0.6043 |
| 9 | 0.8114 | 0.8977 | 0.8398 | 0.8291 | 0.8344 | 0.6154 | 0.6155 |
| Mean | 0.8102 | 0.8970 | 0.8340 | 0.8313 | 0.8326 | 0.6134 | 0.6134 |
| Std | 0.0028 | 0.0026 | 0.0030 | 0.0044 | 0.0026 | 0.0056 | 0.0056 |
# Display the fitted estimator's configuration.
model
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.5, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1, num_parallel_tree=None,
objective='binary:logistic', predictor=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.5, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1, num_parallel_tree=None,
objective='binary:logistic', predictor=None, ...)Ya que se ha seleccionado el modelo en base a Recall, el parametro de Accuracy es más bajo, por lo que se decide ajustar el modelo teniendo en cuenta este parámetro.
# Hyper-parameter search (20 random iterations) optimising Accuracy,
# which dropped when the model was selected by Recall. With
# choose_better=True the original model is returned if it outperforms
# the tuned one. (Removed dead commented-out `custom_grid` argument.)
tuned_model = tune_model(model, optimize='Accuracy', n_iter=20, choose_better=True)
tuned_model
| Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|
| Fold | |||||||
| 0 | 0.7511 | 0.8943 | 0.9582 | 0.7055 | 0.8127 | 0.4661 | 0.5171 |
| 1 | 0.7549 | 0.8890 | 0.9601 | 0.7101 | 0.8164 | 0.4715 | 0.5225 |
| 2 | 0.7515 | 0.8885 | 0.9604 | 0.7053 | 0.8133 | 0.4667 | 0.5190 |
| 3 | 0.7508 | 0.8931 | 0.9587 | 0.7054 | 0.8127 | 0.4652 | 0.5166 |
| 4 | 0.7534 | 0.8950 | 0.9625 | 0.7064 | 0.8148 | 0.4705 | 0.5236 |
| 5 | 0.7567 | 0.8971 | 0.9591 | 0.7133 | 0.8181 | 0.4738 | 0.5235 |
| 6 | 0.7532 | 0.8906 | 0.9582 | 0.7093 | 0.8152 | 0.4677 | 0.5180 |
| 7 | 0.7608 | 0.8951 | 0.9623 | 0.7167 | 0.8215 | 0.4817 | 0.5321 |
| 8 | 0.7447 | 0.8895 | 0.9607 | 0.6986 | 0.8090 | 0.4518 | 0.5069 |
| 9 | 0.7485 | 0.8924 | 0.9613 | 0.7032 | 0.8122 | 0.4581 | 0.5123 |
| Mean | 0.7526 | 0.8925 | 0.9601 | 0.7074 | 0.8146 | 0.4673 | 0.5191 |
| Std | 0.0042 | 0.0028 | 0.0015 | 0.0049 | 0.0033 | 0.0078 | 0.0065 |
Fitting 10 folds for each of 20 candidates, totalling 200 fits Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.5, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1, num_parallel_tree=None,
objective='binary:logistic', predictor=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.5, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1, num_parallel_tree=None,
                                           objective='binary:logistic', predictor=None, ...)Aunque se observa como Recall sube bastantes puntos, el resto de estimadores bajan, y además Accuracy baja en exceso, es por esto que se decide finalizar el modelo, con el modelo primero sin su ajuste.
# Diagnostic plots for the selected model: feature importance, ROC
# curve, confusion matrix and precision-recall curve.
plots = ['feature_all', 'auc', 'confusion_matrix', 'pr']
# plot_model is called for its rendering side effect only, so a plain
# loop replaces the original list(map(lambda ...)) idiom, which built
# a throwaway list of Nones.
for plot_kind in plots:
    plot_model(model, plot=plot_kind)
[None, None, None, None]
Con el mejor de los modelos seleccionado y ajustado, se ajusta el modelo al conjunto de todos los datos, incluyendo test y train. Realizando finalmente una predicción sobre el conjunto de datos que se extrajo en un principio y no se han usado durante todo el proceso de creación del modelo.
# Refit the chosen model on the full dataset (train + test splits)
# before deployment, as recommended by PyCaret's workflow.
final_model = finalize_model(model)
final_model
Pipeline(memory=FastMemory(location=C:\Users\vicsi\AppData\Local\Temp\joblib),
steps=[('clean_column_names',
TransformerWrapper(exclude=None, include=None,
transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))),
('actual_estimator',
XGBClassifier(base_score=None, booster='gbtree',
callbacks=None, colsample_bylevel=None,
colsample_bynode=None, colsam...
grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.5,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100,
n_jobs=-1, num_parallel_tree=None,
objective='binary:logistic', predictor=None, ...))],
verbose=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(memory=FastMemory(location=C:\Users\vicsi\AppData\Local\Temp\joblib),
steps=[('clean_column_names',
TransformerWrapper(exclude=None, include=None,
transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))),
('actual_estimator',
XGBClassifier(base_score=None, booster='gbtree',
callbacks=None, colsample_bylevel=None,
colsample_bynode=None, colsam...
grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.5,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100,
n_jobs=-1, num_parallel_tree=None,
objective='binary:logistic', predictor=None, ...))],
verbose=False)TransformerWrapper(exclude=None, include=None,
transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))CleanColumnNames()
CleanColumnNames()
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.5, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1, num_parallel_tree=None,
predictor=None, random_state=2023051, ...)Ya con el modelo creado, pasamos a realizar las predicciones con los datos que se ha reservado para ello.
# Prediction on the hold-out test set: predict_model without a `data`
# argument scores the test split created by setup().
predict_model(final_model)
| Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|---|
| 0 | Extreme Gradient Boosting | 0.8555 | 0.9335 | 0.8748 | 0.8706 | 0.8727 | 0.7056 | 0.7057 |
| c7 | c10 | c20 | c21 | c108 | c110 | c113 | c114 | c241 | c41 | ... | c115_nan | c112_BRKN | c112_CLER | c112_OBSC | c112_OCST | c112_SCAT | c112_nan | c1 | prediction_label | prediction_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 101912 | 11.0 | 19.0 | 0.793256 | -3.520791 | 9.0 | 3.0 | 0.572457 | 3.0 | 0.080808 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 1 | 0.6099 |
| 19654 | 9.0 | 13.0 | 0.813894 | -3.824593 | 18.0 | 0.0 | 1.431143 | 10.0 | 0.070707 | 9.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 0.6347 |
| 68875 | 2.0 | 16.0 | 0.753070 | -3.720067 | 22.0 | 0.0 | 0.021753 | 10.0 | 0.080808 | 1.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 1 | 0.9855 |
| 13449 | 6.0 | 12.0 | 0.745993 | -4.331430 | 18.0 | 0.0 | 2.289829 | 10.0 | 0.111111 | 7.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 0 | 0.7098 |
| 123737 | 2.0 | 17.0 | 0.794737 | -5.540621 | 18.0 | 0.0 | 1.431143 | 10.0 | 0.060606 | 6.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.9350 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 112956 | 5.0 | 11.0 | 0.780170 | -4.720958 | 6.0 | 0.0 | 2.289829 | 10.0 | 0.272727 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0 | 0 | 0.8259 |
| 124863 | 3.0 | 15.0 | 0.795643 | -3.475672 | 18.0 | 4.0 | 0.021753 | 10.0 | 0.080808 | 1.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 1 | 0.9927 |
| 61932 | 9.0 | 9.0 | 0.795052 | -4.077260 | 18.0 | 0.0 | 0.021753 | 10.0 | 0.080808 | 1.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 1 | 0.9887 |
| 72537 | 4.0 | 18.0 | 0.780755 | -4.809692 | 18.0 | 1.0 | 0.021753 | 8.0 | 0.080808 | 7.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 1 | 0.8285 |
| 123469 | 8.0 | 13.0 | 0.821679 | -4.138170 | 18.0 | 0.0 | 0.021753 | 10.0 | 0.080808 | 6.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 1 | 0.8027 |
39337 rows × 41 columns
# Prediction on the unseen data set that was reserved at the beginning
# of the study and never used during model building.
predict_model(final_model, data=data_unseen)
| Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|---|
| 0 | Extreme Gradient Boosting | 0.8206 | 0.9051 | 0.8435 | 0.8396 | 0.8415 | 0.6349 | 0.6349 |
| c7 | c10 | c20 | c21 | c108 | c110 | c113 | c114 | c241 | c41 | ... | c115_nan | c112_BRKN | c112_CLER | c112_OBSC | c112_OCST | c112_SCAT | c112_nan | c1 | prediction_label | prediction_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 13.0 | 0.745894 | -3.617045 | 18.0 | 0.0 | 0.021753 | 7.0 | 0.080808 | 3.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 0 | 0.9997 |
| 1 | 0.0 | 13.0 | 0.786976 | -3.841888 | 18.0 | 0.0 | 0.021753 | 7.0 | 0.080808 | 9.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 0 | 0.9997 |
| 2 | 0.0 | 19.0 | 0.774605 | -4.586353 | 3.0 | 1.0 | 0.021753 | 5.0 | 0.080808 | 1.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 1.0000 |
| 3 | 4.0 | 13.0 | 0.895473 | -6.670854 | 18.0 | 0.0 | 0.021753 | 7.0 | 0.080808 | 3.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 0 | 0.9959 |
| 4 | 4.0 | 13.0 | 0.815791 | -3.429049 | 18.0 | 0.0 | 0.021753 | 7.0 | 0.080808 | 4.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 0 | 0.9992 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6896 | 6.0 | 17.0 | 0.823314 | -5.031528 | 6.0 | 0.0 | 0.021753 | 10.0 | 0.111111 | 1.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 0 | 0.5724 |
| 6897 | 5.0 | 11.0 | 0.800877 | -3.344075 | 6.0 | 0.0 | 0.021753 | 10.0 | 0.262626 | 1.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 1 | 0.8345 |
| 6898 | 2.0 | 15.0 | 0.817107 | -4.209609 | 6.0 | 0.0 | 0.021753 | 10.0 | 0.151515 | 1.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 1 | 0.5924 |
| 6899 | 6.0 | 14.0 | 0.757800 | -3.656900 | 18.0 | 0.0 | 0.021753 | 10.0 | 0.070707 | 9.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 1 | 0.9619 |
| 6900 | 8.0 | 17.0 | 0.788251 | -3.899039 | 6.0 | 0.0 | 0.021753 | 10.0 | 0.131313 | 3.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 1 | 0.6769 |
6901 rows × 41 columns
Tras todo el proceso de modelado y ajuste del mismo, para poder utilizar el modelo en un entorno de producción, es necesario guardar el mismo. Para ello, se emplean las propias herramientas de PyCaret.
# Timestamp (YYYYMMDD_HHMM) embedded in the saved model's file name so
# successive runs never overwrite each other.
date = datetime.now().strftime('%Y%m%d_%H%M')

# Persist the full transformation pipeline plus the fitted model using
# PyCaret's own serialisation helper.
model_name = 'Forecast_model'
save_model(final_model, f'{model_info_dir}{model_name}_{date}')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=FastMemory(location=C:\Users\vicsi\AppData\Local\Temp\joblib),
steps=[('clean_column_names',
TransformerWrapper(exclude=None, include=None,
transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))),
('actual_estimator',
XGBClassifier(base_score=None, booster='gbtree',
callbacks=None, colsample_bylevel=None,
colsample_bynode=None, colsam...
grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.5,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100,
n_jobs=-1, num_parallel_tree=None,
objective='binary:logistic', predictor=None, ...))],
verbose=False),
'Forecast/Forecast_model_20230512_1239.pkl')